salmon_align/
lib.rs

1//! `salmon-align`: alignment-based (BAM) quantification.
2//!
3//! The alternative to mapping FASTQ reads: take a BAM of reads already aligned
4//! to the *transcriptome* (each `@SQ` is a transcript) and quantify directly
5//! from those alignments. References and their lengths come from the BAM
6//! header, so no index or FASTA is required for basic quantification.
7//!
8//! For each fragment (the contiguous run of records sharing a read name) the
9//! set of transcripts it aligns to becomes an equivalence class, weighted by
10//! alignment score (`AS`) via the same `exp(-scoreExp·(best−score))` rule used
11//! in the mapping path. Fragment lengths (`TLEN`) feed the fragment-length
12//! distribution. The class set then flows through the shared EM
13//! ([`salmon_infer`]) to `quant.sf`. Mirrors salmon's `quant -a` mode (the
14//! position-binned alignment error model is a later refinement).
15
16mod error_model;
17
18use std::collections::HashMap;
19use std::io::{BufRead, Write};
20use std::path::{Path, PathBuf};
21
22use anyhow::{Context, Result};
23use noodles_bam as bam;
24use noodles_sam as sam;
25use noodles_sam::alignment::record::cigar::op::Kind;
26use noodles_sam::alignment::record::data::field::{Tag, Value};
27use noodles_sam::Header;
28use serde::Serialize;
29
30use error_model::{AlignmentModel, AlnOp, SharedAlignmentModel};
31use salmon_core::{
32    is_compatible, LibraryFormat, MateStatus, ReadOrientation, ReadStrandedness, ReadType,
33};
34use salmon_eqclass::{range_factorize_bins, EquivalenceClassBuilder, TranscriptGroup};
35use salmon_infer::{optimize, optimize_with_init, EmOptions};
36use salmon_model::FragmentLengthDistribution;
37
38const SALMON_VERSION: &str = env!("CARGO_PKG_VERSION");
39
40/// Options for alignment-based quantification.
41#[derive(Debug, Clone)]
42pub struct AlignQuantOptions {
43    /// BAM of alignments to the transcriptome (records grouped by read name)
44    pub bam: PathBuf,
45    /// output directory
46    pub output_dir: PathBuf,
47    /// library type string (recorded in output)
48    pub lib_type: String,
49    /// EM/VBEM options
50    pub em: EmOptions,
51    /// range-factorization bins (0 disables)
52    pub range_factorization_bins: u32,
53    /// soft-weight decay applied to alignment-score differences
54    pub score_exp: f64,
55    /// transcriptome FASTA (`-t`); required to train the alignment error model
56    pub transcripts: Option<PathBuf>,
57    /// disable the alignment error model (salmon's `--noErrorModel`)
58    pub no_error_model: bool,
59    /// enable sequence-specific bias correction (`--seqBias`)
60    pub seq_bias: bool,
61    /// enable fragment-GC bias correction (`--gcBias`)
62    pub gc_bias: bool,
63    /// enable positional bias correction (`--posBias`)
64    pub pos_bias: bool,
65    /// weight multiplier for orientation-incompatible alignments; `0` drops them
66    /// (salmon's default `ignoreIncompat` behavior)
67    pub incompat_prior: f64,
68    /// fragment-length distribution prior mean, SD, and max tracked length
69    /// (`--fldMean` / `--fldSD` / `--fldMax`)
70    pub fld_mean: f64,
71    pub fld_sd: f64,
72    pub fld_max: usize,
73    /// online-phase forgetting factor (`--forgettingFactor`, salmon default 0.65)
74    pub forgetting_factor: f64,
75    /// initialize the EM uniformly instead of with the online-estimate-blended
76    /// warm start (`--initUniform`)
77    pub init_uniform: bool,
78    /// significant digits for the EffectiveLength and NumReads columns of
79    /// `quant.sf` (`--sigDigits`, salmon default 3)
80    pub sig_digits: u32,
81    /// number of read-position bins in the alignment error model
82    /// (`--numErrorBins`, salmon default 4)
83    pub num_error_bins: usize,
84    /// drop orphan (single-mate) placements in a paired library instead of
85    /// fragment-length-penalizing them (`--discardOrphans`, alignment mode)
86    pub discard_orphans: bool,
87    /// fragment-length sampling stride for the GC bias convolution
88    /// (`--biasSpeedSamp`, default 5)
89    pub bias_speed_samp: usize,
90    /// online-phase auxiliary-model training window (`--numAuxModelSamples`,
91    /// default 5,000,000)
92    pub num_aux_model_samples: u64,
93    /// disable the lower barrier on bias-corrected effective lengths
94    /// (`--noBiasLengthThreshold`)
95    pub no_bias_length_threshold: bool,
96    /// GC bias model bin counts (`--numGCBins` × `--conditionalGCBins`, 25×3)
97    pub gc_bins: usize,
98    pub cond_gc_bins: usize,
99    /// skip abundance estimation + `quant.sf`, emitting only eq-classes,
100    /// library type, and metadata (salmon's `--skipQuant`)
101    pub skip_quant: bool,
102    /// fragments processed before the FLD aux model is applied
103    /// (salmon's `--numPreAuxModelSamples`; prior hardcoded value 5,000)
104    pub num_pre_aux_model_samples: u64,
105    /// Optional shared progress counters. When `Some`, the BAM pass reports
106    /// processed/mapped fragment counts here as it runs so the caller can drive
107    /// a live progress display. `None` (the default) disables sharing.
108    pub progress: Option<std::sync::Arc<salmon_core::ProgressCounters>>,
109}
110
111impl AlignQuantOptions {
112    pub fn new(bam: PathBuf, output_dir: PathBuf) -> Self {
113        Self {
114            bam,
115            output_dir,
116            lib_type: "IU".to_string(),
117            em: EmOptions::default(),
118            range_factorization_bins: 4,
119            score_exp: 1.0,
120            transcripts: None,
121            no_error_model: false,
122            seq_bias: false,
123            gc_bias: false,
124            pos_bias: false,
125            incompat_prior: 0.0,
126            fld_mean: 250.0,
127            fld_sd: 25.0,
128            fld_max: 1000,
129            forgetting_factor: 0.65,
130            init_uniform: false,
131            sig_digits: 3,
132            num_error_bins: 4,
133            discard_orphans: false,
134            bias_speed_samp: 5,
135            num_aux_model_samples: 5_000_000,
136            no_bias_length_threshold: false,
137            gc_bins: salmon_model::gcbias::DEFAULT_GC_BINS,
138            cond_gc_bins: salmon_model::gcbias::DEFAULT_COND_BINS,
139            skip_quant: false,
140            num_pre_aux_model_samples: 5_000,
141            progress: None,
142        }
143    }
144}
145
146/// Quantification results.
147#[derive(Debug, Clone)]
148pub struct AlignQuantResult {
149    pub names: Vec<String>,
150    pub lengths: Vec<u32>,
151    pub eff_lengths: Vec<f64>,
152    pub tpm: Vec<f64>,
153    pub counts: Vec<f64>,
154    pub num_processed: u64,
155    pub num_mapped: u64,
156    pub num_eq_classes: usize,
157    pub frag_len_mean: f64,
158    pub frag_len_sd: f64,
159    pub length_classes: Vec<u32>,
160    pub frag_len_dist: Vec<f64>,
161    pub start_time: String,
162    pub bias_dump: salmon_model::dumps::BiasDump,
163    /// per-transcript (unique, ambiguous) fragment counts for `ambig_info.tsv`
164    pub ambig: (Vec<u32>, Vec<u32>),
165}
166
167/// Current local time as an asctime-style string, matching salmon's timestamps.
168fn asctime_now() -> String {
169    jiff::Zoned::now()
170        .strftime("%a %b %e %H:%M:%S %Y")
171        .to_string()
172}
173
174/// Extract an integer tag value (e.g. `AS`) as `i32`.
175fn value_as_i32(v: &Value) -> Option<i32> {
176    match v {
177        Value::Int8(x) => Some(*x as i32),
178        Value::UInt8(x) => Some(*x as i32),
179        Value::Int16(x) => Some(*x as i32),
180        Value::UInt16(x) => Some(*x as i32),
181        Value::Int32(x) => Some(*x),
182        Value::UInt32(x) => Some(*x as i32),
183        _ => None,
184    }
185}
186
187/// One placement of the fragment on a single transcript: the record indices the
188/// *aligner* reported together — a proper pair (two mates that point at each
189/// other) or a single orphan record. Indices are sorted by reference position
190/// (so index 0 is the left mate).
191struct Placement {
192    tid: u32,
193    idxs: Vec<usize>,
194}
195
196/// Resolve a fragment's records into the placements the aligner actually
197/// intended, rather than cross-producting every read1 with every read2 on a
198/// transcript.
199///
200/// A permissive aligner (e.g. `bowtie2 -k`) reports many alignment records per
201/// fragment; the two mates of one reported pair are linked by their mate fields
202/// — each record's `RNEXT`/`PNEXT` points at the other's `(tid, pos)` — and, when
203/// present, share a hit index (`HI`). Pairing by transcript co-occurrence instead
204/// fabricates concordant pairs the aligner never reported, which keeps a fragment
205/// artificially orientation-compatible and defeats salmon's dropping of
206/// protocol-inconsistent fragments. Here we pair a read1 record with the read2
207/// record that reciprocally references it (and matches its `HI` when both carry
208/// one); records left unpaired — including all records of a single-end library —
209/// become orphan placements.
210fn pair_records(recs: &[FragRecord]) -> Vec<Placement> {
211    let n = recs.len();
212    let mut used = vec![false; n];
213    let mut placements: Vec<Placement> = Vec::with_capacity(n);
214
215    for i in 0..n {
216        if used[i] || !recs[i].is_read1 {
217            continue;
218        }
219        let r1 = &recs[i];
220        // Only mates aligned to the *same* transcript form a single-transcript
221        // pair placement; a mate on another transcript leaves r1 an orphan.
222        let (Some(mtid), Some(mpos)) = (r1.mate_tid, r1.mate_pos) else {
223            continue;
224        };
225        if mtid != r1.tid {
226            continue;
227        }
228        let mut mate: Option<usize> = None;
229        for j in 0..n {
230            if used[j] || recs[j].is_read1 {
231                continue;
232            }
233            let r2 = &recs[j];
234            if r2.tid != mtid
235                || r2.pos != mpos
236                || r2.mate_tid != Some(r1.tid)
237                || r2.mate_pos != Some(r1.pos)
238            {
239                continue;
240            }
241            // A reciprocal coordinate match; prefer one whose HI agrees.
242            let hi_ok = matches!((r1.hi, r2.hi), (Some(a), Some(b)) if a == b)
243                || r1.hi.is_none()
244                || r2.hi.is_none();
245            if hi_ok {
246                mate = Some(j);
247                break;
248            }
249            if mate.is_none() {
250                mate = Some(j);
251            }
252        }
253        if let Some(j) = mate {
254            used[i] = true;
255            used[j] = true;
256            let mut idxs = vec![i, j];
257            idxs.sort_by_key(|&k| recs[k].pos);
258            placements.push(Placement { tid: r1.tid, idxs });
259        }
260    }
261    for i in 0..n {
262        if !used[i] {
263            placements.push(Placement {
264                tid: recs[i].tid,
265                idxs: vec![i],
266            });
267        }
268    }
269    placements
270}
271
272/// `log(Σ exp(xs))`, numerically stable. `xs` is non-empty.
273fn logsumexp(xs: &[f64]) -> f64 {
274    let m = xs.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
275    if m == f64::NEG_INFINITY {
276        return f64::NEG_INFINITY;
277    }
278    m + xs.iter().map(|&x| (x - m).exp()).sum::<f64>().ln()
279}
280
281/// Derive the observed library format, the single-read forward flag, and the
282/// mate status for one reported pair / orphan (`recs[idxs]`), mirroring salmon's
283/// `hitType` (`src/util/SalmonUtils.cpp`).
284///
285/// salmon classifies orientation from the **leftmost** reference coordinate
286/// (`bam_pos`) of each mate — *not* their 5' ends: a forward/reverse pair is
287/// inward (TOWARD) iff the forward mate starts at or before the reverse mate, and
288/// outward (AWAY) otherwise. For a dovetailed/overlapping short fragment where the
289/// reverse mate's leftmost precedes the forward mate's, salmon therefore reports
290/// the pair as outward and (under a strict library type) drops it as a
291/// zero-probability fragment. We match that convention exactly. Strandedness is
292/// keyed on which mate is forward (read 1 forward → SA, read 2 forward → AS).
293fn frag_format(recs: &[FragRecord], idxs: &[usize]) -> (Option<LibraryFormat>, bool, MateStatus) {
294    if idxs.len() >= 2 {
295        let r1 = idxs
296            .iter()
297            .map(|&i| &recs[i])
298            .find(|r| r.is_read1)
299            .unwrap_or(&recs[idxs[0]]);
300        let r2 = idxs
301            .iter()
302            .map(|&i| &recs[i])
303            .find(|r| !r.is_read1)
304            .unwrap_or(&recs[idxs[1]]);
305        let (r1_fw, r2_fw) = (!r1.is_reverse, !r2.is_reverse);
306        let (orientation, strandedness) = if r1_fw != r2_fw {
307            let (fw, rc) = if r1_fw { (r1, r2) } else { (r2, r1) };
308            let orientation = if fw.pos <= rc.pos {
309                ReadOrientation::Toward
310            } else {
311                ReadOrientation::Away
312            };
313            let strandedness = if r1_fw {
314                ReadStrandedness::SA
315            } else {
316                ReadStrandedness::AS
317            };
318            (orientation, strandedness)
319        } else {
320            (
321                ReadOrientation::Same,
322                if r1_fw {
323                    ReadStrandedness::S
324                } else {
325                    ReadStrandedness::A
326                },
327            )
328        };
329        (
330            Some(LibraryFormat::new(
331                ReadType::PairedEnd,
332                orientation,
333                strandedness,
334            )),
335            r1_fw,
336            MateStatus::PairedEndPaired,
337        )
338    } else {
339        let r = &recs[idxs[0]];
340        let status = if r.is_read1 {
341            MateStatus::PairedEndLeft
342        } else {
343            MateStatus::PairedEndRight
344        };
345        (None, !r.is_reverse, status)
346    }
347}
348
349/// 2-bit encode a base (`A=0, C=1, G=2, T=3`; anything else → `0`).
350#[inline]
351fn base_2bit(b: u8) -> u8 {
352    match b {
353        b'A' | b'a' => 0,
354        b'C' | b'c' => 1,
355        b'G' | b'g' => 2,
356        b'T' | b't' => 3,
357        _ => 0,
358    }
359}
360
361/// Load a (optionally gzip'd) transcriptome FASTA and return the (ASCII) base
362/// sequences aligned to the BAM's reference order (`names`); a name absent from
363/// the FASTA yields an empty sequence (its model contributions are skipped). The
364/// same bytes feed both the error model (2-bit on the fly) and the bias models.
365fn load_ref_bytes(path: &Path, names: &[String]) -> Result<Vec<Vec<u8>>> {
366    let file = std::fs::File::open(path).with_context(|| format!("opening {}", path.display()))?;
367    let mut magic = [0u8; 2];
368    let is_gz = {
369        use std::io::Read;
370        let mut f = std::fs::File::open(path)?;
371        f.read_exact(&mut magic).is_ok() && magic == [0x1f, 0x8b]
372    };
373    let reader: Box<dyn BufRead> = if is_gz {
374        Box::new(std::io::BufReader::new(flate2::read::MultiGzDecoder::new(
375            file,
376        )))
377    } else {
378        Box::new(std::io::BufReader::new(file))
379    };
380
381    let mut by_name: HashMap<String, Vec<u8>> = HashMap::new();
382    let mut cur_name: Option<String> = None;
383    let mut cur_seq: Vec<u8> = Vec::new();
384    for line in reader.lines() {
385        let line = line?;
386        if let Some(stripped) = line.strip_prefix('>') {
387            if let Some(n) = cur_name.take() {
388                by_name.insert(n, std::mem::take(&mut cur_seq));
389            }
390            cur_name = Some(stripped.split_whitespace().next().unwrap_or("").to_string());
391        } else if cur_name.is_some() {
392            cur_seq.extend(line.trim_end().bytes());
393        }
394    }
395    if let Some(n) = cur_name.take() {
396        by_name.insert(n, cur_seq);
397    }
398    Ok(names
399        .iter()
400        .map(|n| by_name.remove(n).unwrap_or_default())
401        .collect())
402}
403
404/// One alignment record needed by the error model / weighting.
405struct FragRecord {
406    tid: u32,
407    pos: usize,
408    read_2bit: Vec<u8>,
409    ops: Vec<(AlnOp, usize)>,
410    /// aligner `AS` tag; parsed for completeness but not used for weighting —
411    /// the error model's `errLike` drives the conditional weight, matching
412    /// salmon's behavior for CIGAR-bearing aligners (see [`salmon-align-as-weighting`]).
413    #[allow(dead_code)]
414    score: i32,
415    frag_len: i32,
416    /// reverse-strand alignment (BAM `0x10` flag)
417    is_reverse: bool,
418    /// first mate of the pair (BAM `0x40` flag)
419    is_read1: bool,
420    /// reference span (Σ ref-consuming CIGAR op lengths); the read's 3' end on
421    /// the reference is `pos + ref_span − 1`
422    ref_span: usize,
423    /// mate's transcript id (`RNEXT`), if the mate is mapped
424    mate_tid: Option<u32>,
425    /// mate's 0-based alignment start (`PNEXT`), if the mate is mapped
426    mate_pos: Option<usize>,
427    /// hit index (`HI` tag): the aligner's pairing id, used to disambiguate which
428    /// mate records form one reported alignment when several share coordinates
429    hi: Option<i32>,
430}
431
432impl FragRecord {
433    /// The read's 5' reference position: its leftmost if forward, its rightmost
434    /// if reverse-complemented (salmon's `startPos`).
435    #[inline]
436    fn five_prime(&self) -> usize {
437        if self.is_reverse {
438            self.pos + self.ref_span.saturating_sub(1)
439        } else {
440            self.pos
441        }
442    }
443}
444
445/// The canonical read name: a trailing `/1` or `/2` mate suffix stripped (as
446/// salmon's `getPairedNameLen` does), so the two mates of a fragment group
447/// together even when the aligner kept the suffix in the QNAME.
448#[inline]
449fn canonical_name(name: &[u8]) -> &[u8] {
450    let l = name.len();
451    if l > 2 && name[l - 2] == b'/' {
452        &name[..l - 2]
453    } else {
454        name
455    }
456}
457
458/// Map a noodles CIGAR op kind to our `AlnOp`.
459fn kind_to_op(k: Kind) -> AlnOp {
460    match k {
461        Kind::Match => AlnOp::Match,
462        Kind::SequenceMatch => AlnOp::SeqMatch,
463        Kind::SequenceMismatch => AlnOp::SeqMismatch,
464        Kind::Insertion => AlnOp::Ins,
465        Kind::Deletion => AlnOp::Del,
466        Kind::Skip => AlnOp::RefSkip,
467        Kind::SoftClip => AlnOp::SoftClip,
468        Kind::HardClip => AlnOp::HardClip,
469        Kind::Pad => AlnOp::Pad,
470    }
471}
472
473/// Does this path name a SAM file (plain or gzipped) rather than a BAM?
474fn is_sam_path(p: &Path) -> bool {
475    let s = p.to_string_lossy().to_ascii_lowercase();
476    s.ends_with(".sam") || s.ends_with(".sam.gz")
477}
478
479/// Open a SAM file (transparently gunzipping `.sam.gz`) as a noodles reader over
480/// a boxed `BufRead`, so plain and gzipped inputs share one concrete type.
481fn open_sam_reader(path: &Path) -> Result<sam::io::Reader<Box<dyn BufRead + Send>>> {
482    let file = std::fs::File::open(path).with_context(|| format!("opening {}", path.display()))?;
483    let lower = path.to_string_lossy().to_ascii_lowercase();
484    let inner: Box<dyn BufRead + Send> = if lower.ends_with(".gz") {
485        Box::new(std::io::BufReader::new(flate2::read::MultiGzDecoder::new(
486            file,
487        )))
488    } else {
489        Box::new(std::io::BufReader::new(file))
490    };
491    Ok(sam::io::Reader::new(inner))
492}
493
494/// Read just the header from a SAM/BAM input (chosen by extension).
495fn read_alignment_header(path: &Path) -> Result<Header> {
496    if is_sam_path(path) {
497        open_sam_reader(path)?
498            .read_header()
499            .context("reading SAM header")
500    } else {
501        let mut reader = bam::io::Reader::new(
502            std::fs::File::open(path).with_context(|| format!("opening {}", path.display()))?,
503        );
504        reader.read_header().context("reading BAM header")
505    }
506}
507
508/// Convert one mapped alignment record into a `FragRecord`, returning its
509/// canonical read name alongside. `None` skips the record (unmapped / unnamed /
510/// no reference). Works over any noodles `alignment::Record`, so the same logic
511/// serves BAM and SAM input.
512fn record_to_frag<R: sam::alignment::Record>(
513    record: &R,
514    header: &Header,
515    need_seq: bool,
516) -> Option<(Vec<u8>, FragRecord)> {
517    let flags = record.flags().ok()?;
518    if flags.is_unmapped() {
519        return None;
520    }
521    let name = record.name()?;
522    let cname = canonical_name(name.as_ref()).to_vec();
523
524    let tid = record.reference_sequence_id(header)?.ok()?;
525    let pos = record
526        .alignment_start()
527        .and_then(|r| r.ok())
528        .map(|p| p.get() - 1)
529        .unwrap_or(0);
530    let read_2bit = if need_seq {
531        record.sequence().iter().map(base_2bit).collect()
532    } else {
533        Vec::new()
534    };
535    let ops: Vec<(AlnOp, usize)> = record
536        .cigar()
537        .iter()
538        .filter_map(|r| r.ok())
539        .map(|op| (kind_to_op(op.kind()), op.len()))
540        .collect();
541    let ref_span: usize = ops
542        .iter()
543        .filter(|(o, _)| {
544            matches!(
545                o,
546                AlnOp::Match | AlnOp::SeqMatch | AlnOp::SeqMismatch | AlnOp::Del | AlnOp::RefSkip
547            )
548        })
549        .map(|(_, l)| l)
550        .sum();
551    let ops = if need_seq { ops } else { Vec::new() };
552    let score = record
553        .data()
554        .get(&Tag::ALIGNMENT_SCORE)
555        .and_then(|r| r.ok())
556        .and_then(|v| value_as_i32(&v))
557        .unwrap_or(0);
558    let frag_len = record.template_length().map(|t| t.abs()).unwrap_or(0);
559    let is_reverse = flags.is_reverse_complemented();
560    let is_read1 = flags.is_first_segment();
561    // Mate linkage as the aligner recorded it (RNEXT/PNEXT); a mate that is
562    // unmapped or absent leaves these `None`, making the record an orphan.
563    let mate_tid = (!flags.is_mate_unmapped())
564        .then(|| {
565            record
566                .mate_reference_sequence_id(header)
567                .and_then(|r| r.ok())
568        })
569        .flatten()
570        .map(|t| t as u32);
571    let mate_pos = record
572        .mate_alignment_start()
573        .and_then(|r| r.ok())
574        .map(|p| p.get() - 1);
575    let hi = record
576        .data()
577        .get(&Tag::HIT_INDEX)
578        .and_then(|r| r.ok())
579        .and_then(|v| value_as_i32(&v));
580    Some((
581        cname,
582        FragRecord {
583            tid: tid as u32,
584            pos,
585            read_2bit,
586            ops,
587            score,
588            frag_len,
589            is_reverse,
590            is_read1,
591            ref_span,
592            mate_tid,
593            mate_pos,
594            hi,
595        },
596    ))
597}
598
599/// Read-only configuration + shared (thread-safe) sinks for the online pass.
600/// Held across the whole pass; the worker threads borrow it immutably.
601struct PassCfg<'a> {
602    online: Option<&'a salmon_infer::OnlineInference>,
603    fld: &'a FragmentLengthDistribution,
604    eq_builder: &'a EquivalenceClassBuilder,
605    ref_bytes: &'a [Vec<u8>],
606    lengths: &'a [u32],
607    gc_store: salmon_model::GcStore<'a>,
608    length_class: Option<&'a [usize]>,
609    expected_format: Option<LibraryFormat>,
610    ignore_incompat: bool,
611    incompat_prior: f64,
612    paired_lib: bool,
613    /// drop single-mate (orphan) placements in a paired library (`--discardOrphans`)
614    discard_orphans: bool,
615    /// fragments processed before the FLD aux model is applied (`--numPreAuxModelSamples`)
616    pre_burnin: u64,
617    range_factorization_bins: u32,
618    use_error_model: bool,
619    /// number of read-position bins in the alignment error model
620    /// (salmon's `--numErrorBins`)
621    error_bins: usize,
622    seq_bias: bool,
623    gc_bias: bool,
624    /// GC bias model bin counts (`--conditionalGCBins` × `--numGCBins`)
625    cond_gc_bins: usize,
626    gc_bins: usize,
627    pos_bias: bool,
628    need_seq: bool,
629    minibatch: usize,
630    nthreads: usize,
631    /// optional shared live-progress counters (updated per batch)
632    progress: Option<&'a salmon_core::ProgressCounters>,
633}
634
635/// Outputs of the online pass: the observed bias models (merged from the
636/// per-worker accumulators) and the processed/mapped fragment counts. The error
637/// model is developed in a shared atomic structure internal to the pass and is
638/// not needed afterward.
639struct PassAccum {
640    seq_obs: Option<(salmon_model::SBModel, salmon_model::SBModel)>,
641    gc_obs: Option<salmon_model::GcFragModel>,
642    pos_obs: Option<(
643        Vec<salmon_model::SimplePosBias>,
644        Vec<salmon_model::SimplePosBias>,
645    )>,
646    num_processed: u64,
647    num_mapped: u64,
648}
649
650/// The online pass over an alignment record stream, structured as a persistent
651/// producer/worker pool (mirroring salmon's parse-threads → queue →
652/// quant-threads model).
653///
654/// One reader thread does only the cheap work — deserialize each record and
655/// group consecutive mapped records by read name into raw fragment groups — and
656/// pushes minibatches onto a bounded work queue (measured serial floor for this
657/// is ~5% of the pass). `nthreads` persistent worker threads pull minibatches
658/// continuously (no per-batch barrier; the MPMC queue load-balances), and each
659/// does the expensive `record_to_frag` (2-bit encoding, CIGAR/op extraction,
660/// tag parsing, allocation) *and* the per-fragment weighting. Workers read the
661/// shared atomic error model for `basis` and flush their own model delta into it
662/// after each minibatch; bias accumulators are per-worker and merged once at the
663/// end (they are only read after the pass).
664fn run_online_pass<R, I>(
665    records: I,
666    header: &Header,
667    cfg: &PassCfg,
668    acc: &mut PassAccum,
669) -> Result<()>
670where
671    R: sam::alignment::Record + Send + Sync,
672    I: Iterator<Item = std::io::Result<R>> + Send,
673{
674    use crossbeam_channel::bounded;
675
676    // Fragments a worker processes between flushing its error-model delta into
677    // the shared model. Small = fresher shared model (closer to salmon's live
678    // per-transition training, better parity) but more atomic contention on the
679    // hot match cell; 1 = per-fragment.
680    const FLUSH_INTERVAL: usize = 64;
681
682    // Shared atomic error model: read concurrently for `basis`, updated by the
683    // workers flushing their per-thread deltas into it between minibatches.
684    let shared_model = cfg
685        .use_error_model
686        .then(|| SharedAlignmentModel::new(1.0, cfg.error_bins));
687    let shared_model_ref = shared_model.as_ref();
688    let minibatch = cfg.minibatch;
689    let need_seq = cfg.need_seq;
690    // Each batch carries its forgetting-mass step, assigned by the reader in
691    // batch order (so the online schedule is tied to the minibatch index, as in
692    // salmon, rather than to nondeterministic worker-pull order).
693    let (tx, rx) = bounded::<(f64, Vec<Vec<R>>)>(2 * cfg.nthreads + 1);
694    let online_reader = cfg.online;
695
696    std::thread::scope(|scope| -> Result<()> {
697        // Reader/parser thread.
698        let reader = scope.spawn(move || -> Result<()> {
699            let mut cur_name: Vec<u8> = Vec::new();
700            let mut have = false;
701            let mut group: Vec<R> = Vec::new();
702            let mut batch: Vec<Vec<R>> = Vec::with_capacity(minibatch);
703            for result in records {
704                let rec = result.context("reading alignment record")?;
705                if rec.flags().map(|f| f.is_unmapped()).unwrap_or(true) {
706                    continue;
707                }
708                let Some(name) = rec.name() else { continue };
709                let cname = canonical_name(name.as_ref()).to_vec();
710                if !have {
711                    cur_name = cname;
712                    have = true;
713                } else if cname != cur_name {
714                    batch.push(std::mem::take(&mut group));
715                    cur_name = cname;
716                    if batch.len() >= minibatch {
717                        let fm = online_reader.map(|o| o.next_log_fm()).unwrap_or(0.0);
718                        if tx.send((fm, std::mem::take(&mut batch))).is_err() {
719                            return Ok(()); // all workers gone
720                        }
721                        batch = Vec::with_capacity(minibatch);
722                    }
723                }
724                group.push(rec);
725            }
726            if have && !group.is_empty() {
727                batch.push(group);
728            }
729            if !batch.is_empty() {
730                let fm = online_reader.map(|o| o.next_log_fm()).unwrap_or(0.0);
731                let _ = tx.send((fm, batch));
732            }
733            Ok(())
734        });
735
736        // Persistent worker pool: each pulls minibatches until the queue closes.
737        let mut workers = Vec::with_capacity(cfg.nthreads);
738        for _ in 0..cfg.nthreads {
739            let rx = rx.clone();
740            workers.push(scope.spawn(move || -> (Local, u64) {
741                let mut local = Local::new(
742                    cfg.use_error_model,
743                    cfg.error_bins,
744                    cfg.seq_bias,
745                    cfg.gc_bias,
746                    cfg.cond_gc_bins,
747                    cfg.gc_bins,
748                    cfg.pos_bias,
749                );
750                let mut count = 0u64;
751                let mut frags: Vec<FragRecord> = Vec::new();
752                while let Ok((log_fm, raw_batch)) = rx.recv() {
753                    let ctx = FragCtx {
754                        model: shared_model_ref,
755                        online: cfg.online,
756                        fld: cfg.fld,
757                        eq_builder: cfg.eq_builder,
758                        ref_bytes: cfg.ref_bytes,
759                        lengths: cfg.lengths,
760                        gc_store: cfg.gc_store,
761                        length_class: cfg.length_class,
762                        expected_format: cfg.expected_format,
763                        ignore_incompat: cfg.ignore_incompat,
764                        incompat_prior: cfg.incompat_prior,
765                        paired_lib: cfg.paired_lib,
766                        discard_orphans: cfg.discard_orphans,
767                        pre_burnin: cfg.pre_burnin,
768                        range_factorization_bins: cfg.range_factorization_bins,
769                        log_fm,
770                    };
771                    let mut since_flush = 0usize;
772                    for raw_group in &raw_batch {
773                        frags.clear();
774                        for r in raw_group {
775                            if let Some((_, f)) = record_to_frag(r, header, need_seq) {
776                                frags.push(f);
777                            }
778                        }
779                        process_fragment(&frags, &ctx, &mut local);
780                        // Publish the error-model delta into the shared model
781                        // every FLUSH_INTERVAL fragments so other workers' `basis`
782                        // sees fresh-enough training. The update granularity is
783                        // what governs parity with salmon's live per-transition
784                        // model: ~per-fragment freshness recovers it, while the
785                        // batched flush keeps hot-cell atomic contention low.
786                        since_flush += 1;
787                        if since_flush >= FLUSH_INTERVAL {
788                            if let (Some(sm), Some(d)) = (shared_model_ref, local.model.as_mut()) {
789                                sm.flush_from(d);
790                                d.clear();
791                            }
792                            since_flush = 0;
793                        }
794                    }
795                    if since_flush > 0 {
796                        if let (Some(sm), Some(d)) = (shared_model_ref, local.model.as_mut()) {
797                            sm.flush_from(d);
798                            d.clear();
799                        }
800                    }
801                    count += raw_batch.len() as u64;
802                    // Live progress: alignment mode treats every fragment in the
803                    // BAM as processed+mapped (see the totals below), so bump both.
804                    if let Some(p) = cfg.progress {
805                        let n = raw_batch.len() as u64;
806                        p.processed
807                            .fetch_add(n, std::sync::atomic::Ordering::Relaxed);
808                        p.mapped.fetch_add(n, std::sync::atomic::Ordering::Relaxed);
809                    }
810                }
811                (local, count)
812            }));
813        }
814        drop(rx); // workers hold their own clones; lets the queue disconnect
815
816        reader
817            .join()
818            .map_err(|_| anyhow::anyhow!("alignment reader thread panicked"))??;
819
820        // Merge the per-worker bias accumulators + counts.
821        let mut merged = Local::new(
822            cfg.use_error_model,
823            cfg.error_bins,
824            cfg.seq_bias,
825            cfg.gc_bias,
826            cfg.cond_gc_bins,
827            cfg.gc_bins,
828            cfg.pos_bias,
829        );
830        let mut total = 0u64;
831        for w in workers {
832            let (local, count) = w
833                .join()
834                .map_err(|_| anyhow::anyhow!("alignment worker thread panicked"))?;
835            merged = merged.merge(local);
836            total += count;
837        }
838        acc.seq_obs = merged.seq_obs;
839        acc.gc_obs = merged.gc_obs;
840        acc.pos_obs = merged.pos_obs;
841        acc.num_processed = total;
842        acc.num_mapped = total;
843        Ok(())
844    })
845}
846
847/// Dispatch the online pass over a SAM/BAM file (chosen by extension). BAM is
848/// decoded on a small BGZF worker pool (the framing/parse stays serial on the
849/// reader thread, but the decompression overlaps it).
850fn stream_online_pass(bam_path: &Path, cfg: &PassCfg, acc: &mut PassAccum) -> Result<()> {
851    if is_sam_path(bam_path) {
852        let mut reader = open_sam_reader(bam_path)?;
853        let header = reader.read_header().context("reading SAM header")?;
854        run_online_pass(reader.records(), &header, cfg, acc)
855    } else {
856        let file = std::fs::File::open(bam_path)
857            .with_context(|| format!("opening {}", bam_path.display()))?;
858        let workers = std::thread::available_parallelism()
859            .map(|n| n.get())
860            .unwrap_or(4)
861            .clamp(1, 4);
862        let workers = std::num::NonZeroUsize::new(workers).unwrap();
863        let decoder = noodles_bgzf::io::MultithreadedReader::with_worker_count(workers, file);
864        let mut reader = bam::io::Reader::from(decoder);
865        let header = reader.read_header().context("reading BAM header")?;
866        run_online_pass(reader.records(), &header, cfg, acc)
867    }
868}
869
870/// Read-only shared state for processing one fragment. Held by `&` so a whole
871/// minibatch of fragments can be processed in parallel against the model and
872/// abundance state as of the *previous* batch (salmon's minibatch staleness).
873struct FragCtx<'a> {
874    /// shared error model, read concurrently for `basis` (workers flush their
875    /// own deltas into it between minibatches)
876    model: Option<&'a SharedAlignmentModel>,
877    online: Option<&'a salmon_infer::OnlineInference>,
878    fld: &'a FragmentLengthDistribution,
879    eq_builder: &'a EquivalenceClassBuilder,
880    ref_bytes: &'a [Vec<u8>],
881    lengths: &'a [u32],
882    gc_store: salmon_model::GcStore<'a>,
883    length_class: Option<&'a [usize]>,
884    expected_format: Option<LibraryFormat>,
885    ignore_incompat: bool,
886    incompat_prior: f64,
887    paired_lib: bool,
888    /// drop single-mate (orphan) placements in a paired library (`--discardOrphans`)
889    discard_orphans: bool,
890    /// fragments processed before the FLD aux model is applied (`--numPreAuxModelSamples`)
891    pre_burnin: u64,
892    range_factorization_bins: u32,
893    /// this batch's forgetting mass (online phase)
894    log_fm: f64,
895}
896
897/// Per-thread accumulators for the error model and bias models. Each worker
898/// folds its fragments into a `Local` (the error-model matrices seeded empty so
899/// the pseudocount lives only in the global); the per-batch reduction merges
900/// these and the result is folded into the globals between minibatches.
901struct Local {
902    model: Option<AlignmentModel>,
903    seq_obs: Option<(salmon_model::SBModel, salmon_model::SBModel)>,
904    gc_obs: Option<salmon_model::GcFragModel>,
905    pos_obs: Option<(
906        Vec<salmon_model::SimplePosBias>,
907        Vec<salmon_model::SimplePosBias>,
908    )>,
909}
910
911impl Local {
912    #[allow(clippy::too_many_arguments)]
913    fn new(
914        error_model: bool,
915        error_bins: usize,
916        seq_bias: bool,
917        gc_bias: bool,
918        cond_gc_bins: usize,
919        gc_bins: usize,
920        pos_bias: bool,
921    ) -> Self {
922        let mk = || {
923            (0..salmon_model::NUM_LENGTH_CLASSES)
924                .map(|_| salmon_model::SimplePosBias::default())
925                .collect::<Vec<_>>()
926        };
927        Self {
928            model: error_model.then(|| AlignmentModel::empty(error_bins)),
929            seq_obs: seq_bias.then(|| (salmon_model::SBModel::new(), salmon_model::SBModel::new())),
930            gc_obs: gc_bias.then(|| salmon_model::GcFragModel::new(cond_gc_bins, gc_bins)),
931            pos_obs: pos_bias.then(|| (mk(), mk())),
932        }
933    }
934
935    fn merge(mut self, other: Self) -> Self {
936        if let (Some(a), Some(b)) = (self.model.as_mut(), other.model.as_ref()) {
937            a.combine(b);
938        }
939        if let (Some(a), Some(b)) = (self.seq_obs.as_mut(), other.seq_obs.as_ref()) {
940            a.0.combine_counts(&b.0);
941            a.1.combine_counts(&b.1);
942        }
943        if let (Some(a), Some(b)) = (self.gc_obs.as_mut(), other.gc_obs.as_ref()) {
944            a.combine_counts(b);
945        }
946        if let (Some(a), Some(b)) = (self.pos_obs.as_mut(), other.pos_obs.as_ref()) {
947            for (x, y) in a.0.iter_mut().zip(&b.0) {
948                x.combine(y);
949            }
950            for (x, y) in a.1.iter_mut().zip(&b.1) {
951                x.combine(y);
952            }
953        }
954        self
955    }
956}
957
958/// Process one fragment (a group of records sharing a read name): pair its
959/// records into reported placements, compute each placement's conditional
960/// log-weight, build the equivalence class, develop online abundances, and
961/// (during burn-in) accumulate the error-model and bias deltas into `local`.
962/// Pure with respect to shared state except for the concurrency-safe sinks
963/// (`fld`, `online`, `eq_builder`), so it is safe to run in parallel.
964fn process_fragment(recs: &[FragRecord], ctx: &FragCtx, local: &mut Local) {
965    use salmon_model::seqbias::{CONTEXT_LEFT, CONTEXT_LENGTH, CONTEXT_RIGHT};
966    // salmon's LOG_EPSILON = log(0.375e-10): the orphan / implausible-length penalty.
967    const LOG_EPSILON: f64 = -23.998_158_637_57;
968
969    // Pair records into the placements the aligner reported (proper pairs +
970    // orphans), NOT a cross-product of every read1/read2 on a transcript.
971    let placements = pair_records(recs);
972    let frag_len = recs.iter().map(|r| r.frag_len).max().unwrap_or(0);
973    if frag_len > 0 {
974        ctx.fld.add_val(frag_len as usize, 0.0);
975    }
976    let use_aux = ctx
977        .online
978        .is_none_or(|o| o.num_assigned() >= ctx.pre_burnin);
979
980    // Per surviving placement (one reported alignment): conditional log-weight
981    // (eq-class) + online log-aux + fragment geometry.
982    let mut sp_tid: Vec<u32> = Vec::with_capacity(placements.len());
983    let mut sp_eq: Vec<f64> = Vec::with_capacity(placements.len());
984    let mut sp_online: Vec<f64> = Vec::with_capacity(placements.len());
985    let mut sp_geom: Vec<(usize, usize, bool)> = Vec::with_capacity(placements.len());
986    let mut sp_pl: Vec<usize> = Vec::with_capacity(placements.len());
987    for (pi, pl) in placements.iter().enumerate() {
988        let tid = pl.tid;
989        let idxs = &pl.idxs;
990        // --discardOrphans: in a paired library, drop single-mate placements
991        // entirely instead of fragment-length-penalizing them below.
992        if ctx.discard_orphans && ctx.paired_lib && idxs.len() < 2 {
993            continue;
994        }
995        let refseq = ctx
996            .ref_bytes
997            .get(tid as usize)
998            .map(|v| v.as_slice())
999            .unwrap_or(&[]);
1000        // Conditional log-weight basis = salmon's `errLike` (Σ(fg−bg) over the
1001        // mate(s) under the error model; uniform 0.0 when it is disabled).
1002        let basis = if let Some(m) = ctx.model {
1003            if refseq.is_empty() {
1004                0.0
1005            } else {
1006                let mut ll = 0.0;
1007                for (rank, &i) in idxs.iter().enumerate() {
1008                    let r = &recs[i];
1009                    let (fg, bg) = m.log_likelihood(&r.read_2bit, refseq, r.pos, &r.ops, rank == 0);
1010                    ll += fg - bg;
1011                }
1012                ll
1013            }
1014        } else {
1015            0.0
1016        };
1017        let rl = ctx.lengths[tid as usize] as i32;
1018        let flen = idxs.iter().map(|&i| recs[i].frag_len).max().unwrap_or(0);
1019        let proper = idxs.len() >= 2 && flen > 0;
1020        let frag_start = recs[idxs[0]].pos;
1021        let frag_end = frag_start + (flen.max(1) as usize) - 1;
1022        let start_pos = if proper && flen <= rl {
1023            -(((rl - flen + 1) as f64).ln())
1024        } else {
1025            -((rl.max(1) as f64).ln())
1026        };
1027        let log_frag_prob = if proper {
1028            if use_aux {
1029                ctx.fld.pmf(flen as usize)
1030            } else {
1031                0.0
1032            }
1033        } else if ctx.paired_lib {
1034            LOG_EPSILON
1035        } else {
1036            0.0
1037        };
1038        let mut aux = basis + log_frag_prob;
1039        if let Some(exp) = ctx.expected_format {
1040            let (obs, is_fw, status) = frag_format(recs, idxs);
1041            if !is_compatible(exp, obs, is_fw, status) {
1042                if ctx.ignore_incompat {
1043                    continue; // this placement contributes nothing
1044                }
1045                aux += ctx.incompat_prior.ln();
1046            }
1047        }
1048        sp_tid.push(tid);
1049        sp_eq.push(aux);
1050        sp_online.push(aux + start_pos);
1051        sp_geom.push((frag_start, frag_end, proper));
1052        sp_pl.push(pi);
1053    }
1054    // a fragment whose every reported alignment was incompatible is a
1055    // zero-probability fragment: it is not assigned and joins no eq-class.
1056    if sp_tid.is_empty() {
1057        return;
1058    }
1059
1060    // Aggregate surviving placements by distinct transcript id (sorted).
1061    let mut agg: std::collections::BTreeMap<u32, Vec<usize>> = std::collections::BTreeMap::new();
1062    for (k, &t) in sp_tid.iter().enumerate() {
1063        agg.entry(t).or_default().push(k);
1064    }
1065    let tids: Vec<u32> = agg.keys().cloned().collect();
1066    let eq_log: Vec<f64> = agg
1067        .values()
1068        .map(|ks| logsumexp(&ks.iter().map(|&k| sp_eq[k]).collect::<Vec<_>>()))
1069        .collect();
1070    let online_log: Vec<f64> = agg
1071        .values()
1072        .map(|ks| logsumexp(&ks.iter().map(|&k| sp_online[k]).collect::<Vec<_>>()))
1073        .collect();
1074
1075    // eq-class weights = softmax(eq_log)
1076    let maxe = eq_log.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
1077    let mut weights: Vec<f64> = eq_log.iter().map(|&l| (l - maxe).exp()).collect();
1078    let wsum: f64 = weights.iter().sum();
1079    if wsum > 0.0 {
1080        for w in &mut weights {
1081            *w /= wsum;
1082        }
1083    }
1084
1085    // abundance-aware posteriors (online), per distinct transcript
1086    let post: Vec<f64> = match ctx.online {
1087        Some(o) => {
1088            let maps: Vec<(u32, f64)> = tids
1089                .iter()
1090                .cloned()
1091                .zip(online_log.iter().cloned())
1092                .collect();
1093            o.assign_fragment(&maps, ctx.log_fm)
1094        }
1095        None => weights.clone(),
1096    };
1097
1098    // train the error model + collect bias models, weighted by posteriors
1099    let collecting = ctx.online.is_none_or(|o| o.collecting());
1100    if collecting {
1101        for (ti, (tid, ks)) in agg.iter().enumerate() {
1102            let p_tid = post[ti];
1103            if p_tid <= 0.0 {
1104                continue;
1105            }
1106            let online_log_tid = online_log[ti];
1107            let refseq = ctx
1108                .ref_bytes
1109                .get(*tid as usize)
1110                .map(|v| v.as_slice())
1111                .unwrap_or(&[]);
1112            for &k in ks {
1113                let p = p_tid * (sp_online[k] - online_log_tid).exp();
1114                if p <= 0.0 {
1115                    continue;
1116                }
1117                let idxs = &placements[sp_pl[k]].idxs;
1118                if let Some(m) = local.model.as_mut() {
1119                    if !refseq.is_empty() {
1120                        let lw = ctx.log_fm + p.ln();
1121                        for (rank, &i) in idxs.iter().enumerate() {
1122                            let r = &recs[i];
1123                            m.update(&r.read_2bit, refseq, r.pos, &r.ops, rank == 0, lw);
1124                        }
1125                    }
1126                }
1127                if refseq.is_empty() {
1128                    continue;
1129                }
1130                let (fs, fe, proper) = sp_geom[k];
1131                let rl = refseq.len();
1132                // Per-read 5' positions, by each read's actual strand.
1133                let (mut fwd_five, mut rev_five): (Option<usize>, Option<usize>) = (None, None);
1134                if idxs.len() == 1 {
1135                    let r = &recs[idxs[0]];
1136                    if r.is_reverse {
1137                        rev_five = Some(r.five_prime());
1138                    } else {
1139                        fwd_five = Some(r.five_prime());
1140                    }
1141                } else {
1142                    let fwd = idxs.iter().map(|&i| &recs[i]).find(|r| !r.is_reverse);
1143                    let rev = idxs.iter().map(|&i| &recs[i]).find(|r| r.is_reverse);
1144                    if let (Some(fr), Some(rr)) = (fwd, rev) {
1145                        let (fp, rp) = (fr.five_prime(), rr.five_prime());
1146                        if fp < rp {
1147                            fwd_five = Some(fp);
1148                            rev_five = Some(rp);
1149                        }
1150                    }
1151                }
1152                if let Some(obs) = local.seq_obs.as_mut() {
1153                    if let Some(five) = fwd_five {
1154                        let s = five as i32 - CONTEXT_LEFT as i32;
1155                        if s >= 0 && (s as usize + CONTEXT_LENGTH) <= rl {
1156                            obs.0.add_context(
1157                                &refseq[s as usize..s as usize + CONTEXT_LENGTH],
1158                                false,
1159                                p,
1160                            );
1161                        }
1162                    }
1163                    if let Some(five) = rev_five {
1164                        let s = five as i32 - CONTEXT_RIGHT as i32;
1165                        if s >= 0 && (s as usize + CONTEXT_LENGTH) <= rl {
1166                            obs.1.add_context(
1167                                &refseq[s as usize..s as usize + CONTEXT_LENGTH],
1168                                true,
1169                                p,
1170                            );
1171                        }
1172                    }
1173                }
1174                if let (Some(gc), true) = (local.gc_obs.as_mut(), proper && fe < rl) {
1175                    let view = ctx.gc_store.view(*tid as usize);
1176                    if let Some((ff, cf)) = salmon_model::gc_desc(&view, fs as i32, fe as i32) {
1177                        gc.inc(ff, cf, p);
1178                    }
1179                }
1180                if let Some(pos) = local.pos_obs.as_mut() {
1181                    let lc = ctx.length_class.unwrap()[*tid as usize];
1182                    if let Some(five) = fwd_five {
1183                        pos.0[lc].add_mass(five as i32, rl as i32, p.ln());
1184                    }
1185                    if let Some(five) = rev_five {
1186                        pos.1[lc].add_mass(five as i32, rl as i32, p.ln());
1187                    }
1188                }
1189            }
1190        }
1191    }
1192
1193    let group = if ctx.range_factorization_bins > 0 {
1194        let bins = range_factorize_bins(&weights, ctx.range_factorization_bins);
1195        TranscriptGroup::with_bins(tids, bins)
1196    } else {
1197        TranscriptGroup::from_sorted(tids)
1198    };
1199    ctx.eq_builder.add_group(group, weights, 1);
1200}
1201
1202/// Is the input coordinate-sorted and *not* grouped by read name?
1203///
1204/// Read once from the `@HD` header (no per-record cost). `SO:coordinate` orders
1205/// records by position, scattering a read's alignments — unusable here. But a file
1206/// can be coordinate-sorted and then re-grouped by name (`samtools collate`, which
1207/// sets `GO:query`), or carry a stale `SO:coordinate` after several samtools steps;
1208/// the reliable signal that records are usably grouped is `GO:query`. So we only
1209/// reject when coordinate-sorted AND not query-grouped. (Query-name *sorted* files
1210/// report `SO:queryname`, not coordinate, and pass.)
1211fn coordinate_sorted_unusable(header: &noodles_sam::Header) -> bool {
1212    let Some(hd) = header.header() else {
1213        return false;
1214    };
1215    let mut so_coord = false;
1216    let mut go_query = false;
1217    // `SO`/`GO` are non-standard tags in this noodles version → read from other_fields.
1218    for (tag, value) in hd.other_fields() {
1219        let t: &[u8; 2] = tag.as_ref();
1220        let v: &[u8] = value.as_ref();
1221        if t == b"SO" {
1222            so_coord = v == &b"coordinate"[..];
1223        } else if t == b"GO" {
1224            go_query = v == &b"query"[..];
1225        }
1226    }
1227    so_coord && !go_query
1228}
1229
1230/// Run alignment-based quantification end-to-end.
1231pub fn quantify_alignments(opts: &AlignQuantOptions) -> Result<AlignQuantResult> {
1232    let start_time = asctime_now();
1233    let header = read_alignment_header(&opts.bam)?;
1234
1235    // Reject coordinate-sorted input up front (header-only check, no per-record cost):
1236    // alignment-mode requires all records of a read/pair to be adjacent (grouped by
1237    // read name), which a coordinate-sorted file violates.
1238    anyhow::ensure!(
1239        !coordinate_sorted_unusable(&header),
1240        "the input BAM/SAM appears to be coordinate-sorted (@HD SO:coordinate) and is not \
1241         grouped by read name (GO:query). Alignment-mode quantification requires that all \
1242         alignment records of a read (or read pair) are adjacent in the file. Please collate \
1243         it by read name first, e.g. `samtools collate` or `samtools sort -n`."
1244    );
1245
1246    // References (transcripts) in @SQ order define the transcript ids.
1247    let names: Vec<String> = header
1248        .reference_sequences()
1249        .keys()
1250        .map(|k| String::from_utf8_lossy(k.as_ref()).into_owned())
1251        .collect();
1252    let lengths: Vec<u32> = header
1253        .reference_sequences()
1254        .values()
1255        .map(|rs| rs.length().get() as u32)
1256        .collect();
1257    let num_refs = names.len();
1258    anyhow::ensure!(num_refs > 0, "BAM header has no reference sequences");
1259
1260    let eq_builder = EquivalenceClassBuilder::new();
1261    let mut fld =
1262        FragmentLengthDistribution::new(1.0, opts.fld_max, opts.fld_mean, opts.fld_sd, 4, 0.5, 1);
1263
1264    // The error model and bias models need the transcriptome (salmon requires `-t`).
1265    let use_error_model = opts.transcripts.is_some() && !opts.no_error_model;
1266    let bias_on = opts.seq_bias || opts.gc_bias || opts.pos_bias;
1267    anyhow::ensure!(
1268        !bias_on || opts.transcripts.is_some(),
1269        "--seqBias/--gcBias/--posBias in alignment mode require -t/--targets (the transcriptome FASTA)"
1270    );
1271    let ref_bytes: Vec<Vec<u8>> = if use_error_model || bias_on {
1272        load_ref_bytes(opts.transcripts.as_ref().unwrap(), &names)?
1273    } else {
1274        Vec::new()
1275    };
1276
1277    // Online (dual-phase) abundances: develop running estimates so the error
1278    // model and bias models are trained/collected with abundance-aware posteriors
1279    // in a single streaming pass (salmon's online phase), rather than two passes.
1280    let ref_lens_u64: Vec<u64> = lengths.iter().map(|&l| l as u64).collect();
1281    let online = (use_error_model || bias_on).then(|| {
1282        salmon_infer::OnlineInference::new(
1283            &ref_lens_u64,
1284            0.05,
1285            opts.forgetting_factor,
1286            opts.num_aux_model_samples,
1287        )
1288    });
1289
1290    // Per-transcript inputs the bias collection needs (the observed bias models
1291    // themselves are accumulated per-worker inside the pass).
1292    use salmon_model::seqbias::CONTEXT_LENGTH;
1293    // GC cumulative-count backing: one rank bitvector over the concatenated
1294    // references (salmon's `--reduceGCMemory`, now the default — faster and ~2x
1295    // leaner than dense per-transcript prefixes, identical results). `gc_store`
1296    // presents per-transcript `GcView`s.
1297    let (gc_rank, gc_offsets): (Option<salmon_model::GcRank>, Vec<u64>) = if opts.gc_bias {
1298        let mut concat: Vec<u8> = Vec::new();
1299        let mut offs: Vec<u64> = Vec::with_capacity(ref_bytes.len() + 1);
1300        offs.push(0);
1301        for s in &ref_bytes {
1302            concat.extend_from_slice(s);
1303            offs.push(concat.len() as u64);
1304        }
1305        (Some(salmon_model::GcRank::new(&concat)), offs)
1306    } else {
1307        (None, Vec::new())
1308    };
1309    let gc_store = match &gc_rank {
1310        Some(r) => salmon_model::GcStore::Rank {
1311            rank: r,
1312            offsets: &gc_offsets,
1313        },
1314        None => salmon_model::GcStore::Dense(&[]),
1315    };
1316    let length_quantiles: Option<Vec<u32>> = opts.pos_bias.then(|| {
1317        salmon_model::compute_length_quantiles(&lengths, salmon_model::NUM_LENGTH_CLASSES)
1318    });
1319    let length_class: Option<Vec<usize>> = length_quantiles.as_ref().map(|q| {
1320        lengths
1321            .iter()
1322            .map(|&l| salmon_model::length_class_index(q, l))
1323            .collect()
1324    });
1325
1326    // ---- online pass (reader/worker pipeline) ------------------------------
1327    // A dedicated reader thread groups raw records by name; the worker pool
1328    // converts (record_to_frag -- the bulk of per-record cost) and weights them
1329    // in parallel. Per-thread error-model/bias deltas merge into the globals
1330    // between minibatches (salmon's minibatch staleness).
1331    const MINIBATCH: usize = 1000;
1332    // A paired library expects two mates; a single mate to a transcript is then
1333    // an "unexpected orphan" and is fragment-length-penalized. (Single-end libs
1334    // aren't.)
1335    let paired_lib = !matches!(opts.lib_type.as_str(), "U" | "SF" | "SR" | "S");
1336    // Orientation-compatibility filtering (salmon): drop alignments whose
1337    // orientation is incompatible with the expected library type. Skipped under
1338    // auto (`A`) library type.
1339    let expected_format = match opts.lib_type.as_str() {
1340        "A" => None,
1341        s => LibraryFormat::parse(s).ok(),
1342    };
1343    let ignore_incompat = opts.incompat_prior <= 0.0;
1344    let nthreads = rayon::current_num_threads().max(1);
1345
1346    // The bias accumulators + counters are developed inside the pass; the model
1347    // is trained there too but not needed afterward. Scope `cfg`/`acc` so their
1348    // borrows (fld, eq_builder, online, ...) are released before the post-pass
1349    // effective-length / EM work below.
1350    let (seq_obs, gc_obs, pos_obs, num_processed, num_mapped) = {
1351        let cfg = PassCfg {
1352            online: online.as_ref(),
1353            fld: &fld,
1354            eq_builder: &eq_builder,
1355            ref_bytes: &ref_bytes,
1356            lengths: &lengths,
1357            gc_store,
1358            length_class: length_class.as_deref(),
1359            expected_format,
1360            ignore_incompat,
1361            incompat_prior: opts.incompat_prior,
1362            paired_lib,
1363            discard_orphans: opts.discard_orphans,
1364            pre_burnin: opts.num_pre_aux_model_samples,
1365            range_factorization_bins: opts.range_factorization_bins,
1366            use_error_model,
1367            error_bins: opts.num_error_bins,
1368            seq_bias: opts.seq_bias,
1369            gc_bias: opts.gc_bias,
1370            cond_gc_bins: opts.cond_gc_bins,
1371            gc_bins: opts.gc_bins,
1372            pos_bias: opts.pos_bias,
1373            need_seq: use_error_model || bias_on,
1374            minibatch: MINIBATCH,
1375            nthreads,
1376            progress: opts.progress.as_deref(),
1377        };
1378        let mut acc = PassAccum {
1379            seq_obs: None,
1380            gc_obs: None,
1381            pos_obs: None,
1382            num_processed: 0,
1383            num_mapped: 0,
1384        };
1385        stream_online_pass(&opts.bam, &cfg, &mut acc)?;
1386        (
1387            acc.seq_obs,
1388            acc.gc_obs,
1389            acc.pos_obs,
1390            acc.num_processed,
1391            acc.num_mapped,
1392        )
1393    };
1394
1395    // ---- base effective lengths --------------------------------------------
1396    fld.cache();
1397    let cond_means = fld.conditional_means();
1398    let mut eff_lengths = vec![0f64; num_refs];
1399    for (tid, len) in lengths.iter().enumerate() {
1400        eff_lengths[tid] = salmon_model::smoothed_effective_length(&cond_means, *len as usize);
1401    }
1402
1403    let mut collapsed = eq_builder.finish();
1404    collapsed.update_eff_lengths(&eff_lengths);
1405    let num_eq_classes = collapsed.len();
1406
1407    // Count-blended EM initialization (salmon's `CollapsedEMOptimizer::optimize`):
1408    // seed abundances with a linear combination of the online-phase abundance
1409    // estimates (`projectedCounts`) and the uniform distribution, weighted by the
1410    // fraction of the burn-in target observed. A warm start near the solution
1411    // cuts the number of EM iterations to convergence.
1412    let init_alphas: Option<Vec<f64>> = online.as_ref().map(|o| {
1413        let masses: Vec<f64> = (0..num_refs).map(|t| o.mass_log(t).exp()).collect();
1414        let mass_sum: f64 = masses.iter().sum();
1415        let total_reads = num_mapped as f64;
1416        // online estimates scaled to a proper count distribution (sum = reads)
1417        let projected: Vec<f64> = if mass_sum > 0.0 {
1418            masses.iter().map(|&m| m / mass_sum * total_reads).collect()
1419        } else {
1420            vec![0.0; num_refs]
1421        };
1422        let uniform_prior = if num_refs > 0 {
1423            total_reads / num_refs as f64
1424        } else {
1425            0.0
1426        };
1427        // salmon's numRequiredFragments default (the online-phase target)
1428        const NUM_REQUIRED_FRAGMENTS: f64 = 50_000_000.0;
1429        let frac_observed = (total_reads / NUM_REQUIRED_FRAGMENTS).min(0.999);
1430        projected
1431            .iter()
1432            .map(|&p| p * frac_observed + uniform_prior * (1.0 - frac_observed))
1433            .collect()
1434    });
1435    // `--initUniform` forces the plain uniform EM start; otherwise warm-start
1436    // from the online-estimate-blended init.
1437    let mut em = if opts.skip_quant {
1438        // `--skipQuant`: emit eq-classes + library type + metadata, skip the
1439        // optimizer and quant.sf. Abundances left at zero.
1440        salmon_infer::EmResult {
1441            alphas: vec![0.0; num_refs],
1442            iters: 0,
1443            converged: true,
1444        }
1445    } else if opts.init_uniform {
1446        optimize(&collapsed, num_refs, &opts.em, Some(&eff_lengths))
1447    } else {
1448        optimize_with_init(
1449            &collapsed,
1450            num_refs,
1451            &opts.em,
1452            init_alphas.as_deref(),
1453            Some(&eff_lengths),
1454        )
1455    };
1456
1457    // ---- bias-corrected effective lengths (shared with reads mode) ----------
1458    let mut bias_dump = salmon_model::dumps::BiasDump::default();
1459    if bias_on && !opts.skip_quant {
1460        let log_pmf = fld.log_pmf();
1461        let pmf_lin: Vec<f64> = log_pmf.iter().map(|lp| lp.exp()).collect();
1462        let (fld_cdf, fld_low, fld_high) = salmon_model::seqbias::fld_cdf_and_bounds(&pmf_lin);
1463        let k = if opts.seq_bias { CONTEXT_LENGTH } else { 1 };
1464        let refseq_of = |t: usize| ref_bytes[t].as_slice();
1465
1466        let seq = seq_obs.map(|(mut of, mut or)| {
1467            of.normalize();
1468            or.normalize();
1469            let (ef, er) = salmon_model::build_expected(
1470                num_refs,
1471                refseq_of,
1472                &em.alphas,
1473                &eff_lengths,
1474                &fld_cdf,
1475            );
1476            (of, or, ef, er)
1477        });
1478        if let Some((of, or, ef, er)) = seq.as_ref() {
1479            bias_dump.obs5_seq = of.dump().to_vec();
1480            bias_dump.obs3_seq = or.dump().to_vec();
1481            bias_dump.exp5_seq = ef.dump().to_vec();
1482            bias_dump.exp3_seq = er.dump().to_vec();
1483        }
1484        let gc_ratio_model = if let Some(mut obs) = gc_obs {
1485            let mut exp = salmon_model::build_expected_gc(
1486                num_refs,
1487                refseq_of,
1488                |t| gc_store.view(t),
1489                &em.alphas,
1490                &eff_lengths,
1491                &fld_cdf,
1492                fld_low,
1493                fld_high,
1494                opts.cond_gc_bins,
1495                opts.gc_bins,
1496                k,
1497                opts.bias_speed_samp,
1498            );
1499            obs.normalize();
1500            exp.normalize();
1501            bias_dump.obs_gc = obs.dump().to_vec();
1502            bias_dump.exp_gc = exp.dump().to_vec();
1503            Some(salmon_model::gc_ratio(
1504                &mut obs,
1505                &mut exp,
1506                salmon_model::gcbias::GC_MAX_RATIO,
1507            ))
1508        } else {
1509            None
1510        };
1511        let pos_models = pos_obs.map(|(mut of, mut or)| {
1512            for x in of.iter_mut().chain(or.iter_mut()) {
1513                x.finalize();
1514            }
1515            let (ef, er) = salmon_model::build_expected_pos(
1516                num_refs,
1517                |t| lengths[t] as usize,
1518                &em.alphas,
1519                &eff_lengths,
1520                &fld_cdf,
1521                length_quantiles.as_ref().unwrap(),
1522                k,
1523            );
1524            (of, or, ef, er)
1525        });
1526        if let Some((ofw, orc, efw, erc)) = pos_models.as_ref() {
1527            let masses =
1528                |v: &[salmon_model::SimplePosBias]| v.iter().map(|m| m.masses().to_vec()).collect();
1529            bias_dump.obs5_pos = masses(ofw);
1530            bias_dump.obs3_pos = masses(orc);
1531            bias_dump.exp5_pos = masses(efw);
1532            bias_dump.exp3_pos = masses(erc);
1533        }
1534
1535        for tid in 0..num_refs {
1536            if em.alphas[tid] < 1e-8 {
1537                continue;
1538            }
1539            let s = ref_bytes[tid].as_slice();
1540            let pos_vecs: Option<(Vec<f64>, Vec<f64>)> =
1541                pos_models.as_ref().map(|(ofw, orc, efw, erc)| {
1542                    let lc = length_class.as_ref().unwrap()[tid];
1543                    let rl = s.len();
1544                    let (mut o5, mut e5) = (vec![0.0; rl], vec![0.0; rl]);
1545                    let (mut o3, mut e3) = (vec![0.0; rl], vec![0.0; rl]);
1546                    ofw[lc].project_weights(&mut o5);
1547                    efw[lc].project_weights(&mut e5);
1548                    orc[lc].project_weights(&mut o3);
1549                    erc[lc].project_weights(&mut e3);
1550                    (
1551                        salmon_model::positional_factor(&o5, &e5),
1552                        salmon_model::positional_factor(&o3, &e3),
1553                    )
1554                });
1555            let bias = salmon_model::BiasInputs {
1556                seq: seq.as_ref().map(|(of, or, ef, er)| (of, ef, or, er)),
1557                gc: gc_ratio_model.as_ref().map(|g| (g, gc_store.view(tid))),
1558                pos: pos_vecs
1559                    .as_ref()
1560                    .map(|(pf, pr)| (pf.as_slice(), pr.as_slice())),
1561            };
1562            eff_lengths[tid] = salmon_model::corrected_effective_length_full(
1563                s,
1564                &fld_cdf,
1565                fld_low,
1566                fld_high,
1567                &bias,
1568                eff_lengths[tid],
1569                opts.bias_speed_samp,
1570                opts.no_bias_length_threshold,
1571            );
1572        }
1573        collapsed.update_eff_lengths(&eff_lengths);
1574        em = optimize(&collapsed, num_refs, &opts.em, Some(&eff_lengths));
1575    }
1576    let counts = em.alphas;
1577
1578    let rates: Vec<f64> = (0..num_refs)
1579        .map(|i| {
1580            if eff_lengths[i] > 0.0 {
1581                counts[i] / eff_lengths[i]
1582            } else {
1583                0.0
1584            }
1585        })
1586        .collect();
1587    let rate_sum: f64 = rates.iter().sum();
1588    let tpm: Vec<f64> = rates
1589        .iter()
1590        .map(|r| {
1591            if rate_sum > 0.0 {
1592                r / rate_sum * 1e6
1593            } else {
1594                0.0
1595            }
1596        })
1597        .collect();
1598
1599    let length_classes =
1600        salmon_model::compute_length_quantiles(&lengths, salmon_model::NUM_LENGTH_CLASSES);
1601    let ambig = salmon_infer::ambiguity_counts(&salmon_infer::PackedEqClasses::from_collapsed(
1602        &collapsed, num_refs,
1603    ));
1604    let result = AlignQuantResult {
1605        names,
1606        lengths,
1607        eff_lengths,
1608        tpm,
1609        counts,
1610        num_processed,
1611        num_mapped,
1612        num_eq_classes,
1613        frag_len_mean: fld.mean(),
1614        frag_len_sd: fld.sd(),
1615        length_classes,
1616        frag_len_dist: fld.log_pmf().iter().map(|lp| lp.exp()).collect(),
1617        start_time,
1618        bias_dump,
1619        ambig,
1620    };
1621    write_outputs(opts, &result)?;
1622    Ok(result)
1623}
1624
1625fn write_outputs(opts: &AlignQuantOptions, res: &AlignQuantResult) -> Result<()> {
1626    let dir = &opts.output_dir;
1627    std::fs::create_dir_all(dir.join("aux_info")).context("creating output dirs")?;
1628
1629    // quant.sf (EffectiveLength + NumReads at --sigDigits decimals; TPM at 6).
1630    // Skipped under --skipQuant (no abundances), like salmon.
1631    if !opts.skip_quant {
1632        let sd = opts.sig_digits as usize;
1633        let mut w = std::io::BufWriter::new(std::fs::File::create(dir.join("quant.sf"))?);
1634        writeln!(w, "Name\tLength\tEffectiveLength\tTPM\tNumReads")?;
1635        for i in 0..res.names.len() {
1636            writeln!(
1637                w,
1638                "{}\t{}\t{:.*}\t{:.6}\t{:.*}",
1639                res.names[i], res.lengths[i], sd, res.eff_lengths[i], res.tpm[i], sd, res.counts[i]
1640            )?;
1641        }
1642        w.flush()?;
1643    }
1644
1645    // aux_info/meta_info.json — full field set, matching salmon's alignment mode
1646    // (no index hashes since there is no index; no keep_duplicates).
1647    #[derive(Serialize)]
1648    struct MetaInfo {
1649        salmon_version: String,
1650        samp_type: String,
1651        opt_type: String,
1652        quant_errors: Vec<String>,
1653        num_libraries: usize,
1654        library_types: Vec<String>,
1655        frag_dist_length: usize,
1656        frag_length_mean: f64,
1657        frag_length_sd: f64,
1658        seq_bias_correct: bool,
1659        gc_bias_correct: bool,
1660        num_bias_bins: usize,
1661        mapping_type: String,
1662        // index hashes are empty in alignment mode (no salmon index)
1663        index_seq_hash: String,
1664        index_name_hash: String,
1665        index_seq_hash512: String,
1666        index_name_hash512: String,
1667        index_decoy_seq_hash: String,
1668        index_decoy_name_hash: String,
1669        num_valid_targets: usize,
1670        num_decoy_targets: usize,
1671        num_eq_classes: usize,
1672        serialized_eq_classes: bool,
1673        eq_class_properties: Vec<String>,
1674        length_classes: Vec<u32>,
1675        num_processed: u64,
1676        num_mapped: u64,
1677        num_dovetail_fragments: u64,
1678        num_fragments_filtered_vm: u64,
1679        num_alignments_below_threshold_for_mapped_fragments_vm: u64,
1680        percent_mapped: f64,
1681        num_decoy_fragments: u64,
1682        num_bootstraps: u32,
1683        call: String,
1684        start_time: String,
1685        end_time: String,
1686    }
1687    let pct = if res.num_processed > 0 {
1688        100.0 * res.num_mapped as f64 / res.num_processed as f64
1689    } else {
1690        0.0
1691    };
1692    let eq_class_properties = if opts.range_factorization_bins > 0 {
1693        vec!["range_factorized".to_string(), "gzipped".to_string()]
1694    } else {
1695        vec!["gzipped".to_string()]
1696    };
1697    let meta = MetaInfo {
1698        salmon_version: SALMON_VERSION.to_string(),
1699        samp_type: "none".to_string(),
1700        opt_type: if opts.em.use_vbem { "vb" } else { "em" }.to_string(),
1701        quant_errors: vec![],
1702        num_libraries: 1,
1703        library_types: vec![opts.lib_type.clone()],
1704        frag_dist_length: res.frag_len_dist.len(),
1705        frag_length_mean: res.frag_len_mean,
1706        frag_length_sd: res.frag_len_sd,
1707        seq_bias_correct: opts.seq_bias,
1708        gc_bias_correct: opts.gc_bias,
1709        num_bias_bins: 0,
1710        mapping_type: "alignment".to_string(),
1711        index_seq_hash: String::new(),
1712        index_name_hash: String::new(),
1713        index_seq_hash512: String::new(),
1714        index_name_hash512: String::new(),
1715        index_decoy_seq_hash: String::new(),
1716        index_decoy_name_hash: String::new(),
1717        num_valid_targets: res.names.len(),
1718        num_decoy_targets: 0,
1719        num_eq_classes: res.num_eq_classes,
1720        serialized_eq_classes: false,
1721        eq_class_properties,
1722        length_classes: res.length_classes.clone(),
1723        num_processed: res.num_processed,
1724        num_mapped: res.num_mapped,
1725        num_dovetail_fragments: 0,
1726        num_fragments_filtered_vm: 0,
1727        num_alignments_below_threshold_for_mapped_fragments_vm: 0,
1728        percent_mapped: pct,
1729        num_decoy_fragments: 0,
1730        num_bootstraps: 0,
1731        call: "quant".to_string(),
1732        start_time: res.start_time.clone(),
1733        end_time: asctime_now(),
1734    };
1735    std::fs::write(
1736        dir.join("aux_info").join("meta_info.json"),
1737        serde_json::to_string_pretty(&meta)?,
1738    )?;
1739
1740    // libParams/flenDist.txt, logs/salmon_quant.log, and the aux dumps (shared).
1741    std::fs::create_dir_all(dir.join("libParams")).context("creating libParams")?;
1742    salmon_model::dumps::write_flen_dist(
1743        &dir.join("libParams").join("flenDist.txt"),
1744        &res.frag_len_dist,
1745    )
1746    .context("writing flenDist.txt")?;
1747    salmon_model::dumps::write_fld_dump(&dir.join("aux_info").join("fld.gz"), &res.frag_len_dist)
1748        .context("writing fld.gz")?;
1749    salmon_model::dumps::write_aux_bias_dumps(&dir.join("aux_info"), &res.bias_dump)
1750        .context("writing aux bias dumps")?;
1751    std::fs::create_dir_all(dir.join("logs")).context("creating logs")?;
1752    let log = format!(
1753        "salmon (rust port, alignment mode) v{SALMON_VERSION}\nstart: {}\nend:   {}\n\
1754         library type: {}\nobserved fragments: {}\nmapped fragments:   {}\nmapping rate: {pct:.4}%\n\
1755         number of equivalence classes: {}\nfragment length mean (sd): {:.2} ({:.2})\n",
1756        res.start_time,
1757        asctime_now(),
1758        opts.lib_type,
1759        res.num_processed,
1760        res.num_mapped,
1761        res.num_eq_classes,
1762        res.frag_len_mean,
1763        res.frag_len_sd,
1764    );
1765    std::fs::write(dir.join("logs").join("salmon_quant.log"), log)
1766        .context("writing salmon_quant.log")?;
1767
1768    // aux_info/ambig_info.tsv
1769    {
1770        let (uniq, ambig) = &res.ambig;
1771        let mut w = std::io::BufWriter::new(std::fs::File::create(
1772            dir.join("aux_info").join("ambig_info.tsv"),
1773        )?);
1774        writeln!(w, "UniqueCount\tAmbigCount")?;
1775        for i in 0..uniq.len() {
1776            writeln!(w, "{}\t{}", uniq[i], ambig[i])?;
1777        }
1778        w.flush()?;
1779    }
1780
1781    // cmd_info.json — the invocation record.
1782    #[derive(Serialize)]
1783    struct CmdInfo {
1784        salmon_version: String,
1785        alignments: String,
1786        targets: String,
1787        #[serde(rename = "libType")]
1788        lib_type: String,
1789        output: String,
1790        #[serde(rename = "auxDir")]
1791        aux_dir: String,
1792    }
1793    let cmd = CmdInfo {
1794        salmon_version: SALMON_VERSION.to_string(),
1795        alignments: opts.bam.display().to_string(),
1796        targets: opts
1797            .transcripts
1798            .as_ref()
1799            .map(|p| p.display().to_string())
1800            .unwrap_or_default(),
1801        lib_type: opts.lib_type.clone(),
1802        output: opts.output_dir.display().to_string(),
1803        aux_dir: "aux_info".to_string(),
1804    };
1805    std::fs::write(
1806        dir.join("cmd_info.json"),
1807        serde_json::to_string_pretty(&cmd)?,
1808    )?;
1809    Ok(())
1810}
1811
1812#[cfg(test)]
1813mod tests {
1814    use super::*;
1815
1816    #[test]
1817    fn alignment_score_value_decoding() {
1818        assert_eq!(value_as_i32(&Value::Int32(-12)), Some(-12));
1819        assert_eq!(value_as_i32(&Value::Int8(0)), Some(0));
1820        assert_eq!(value_as_i32(&Value::UInt16(300)), Some(300));
1821        // a non-integer tag value (e.g. a character) has no integer reading
1822        assert_eq!(value_as_i32(&Value::Character(b'A')), None);
1823    }
1824}
salmon_align/lib.rs

salmon_align/
lib.rs