#![allow(
clippy::arithmetic_side_effects,
clippy::cast_lossless,
clippy::cast_possible_truncation,
clippy::cast_possible_wrap,
clippy::doc_markdown,
clippy::indexing_slicing,
clippy::print_stdout,
reason = "example"
)]
use anyhow::Context;
use clap::Parser as _;
use seqair::bam::pileup::PileupOp;
use seqair::reader::{Readers, ResolveTid, SegmentOptions};
use seqair::vcf::{
Alleles, FormatGt, FormatInt, Genotype, InfoInt, Number, OutputFormat, ValueType,
VcfHeaderBuilder, Writer,
record_encoder::{Arr, FormatFieldDef, Gt, InfoFieldDef, Scalar},
};
use seqair_types::{Base, RegionString};
use std::io::{BufWriter, Write};
use std::path::PathBuf;
use std::sync::Arc;
#[derive(Debug, clap::Parser)]
struct Cli {
input: PathBuf,
reference: PathBuf,
#[clap(long, short)]
region: RegionString,
#[clap(long, default_value = "SAMPLE")]
sample: String,
#[clap(long, default_value_t = 5)]
min_depth: u32,
#[clap(long, default_value_t = 0.1)]
min_af: f64,
#[clap(long, default_value_t = 20)]
min_mapq: u8,
#[clap(long, short)]
output: Option<PathBuf>,
}
struct BaseCounts {
counts: [i32; 4],
total: i32,
}
impl BaseCounts {
fn new() -> Self {
Self { counts: [0; 4], total: 0 }
}
fn add(&mut self, base: Base) {
if let Some(idx) = base.known_index() {
self.counts[idx] += 1;
self.total += 1;
}
}
fn best_alt(&self, ref_base: Base) -> Option<(Base, i32)> {
const BASES: [Base; 4] = [Base::A, Base::C, Base::G, Base::T];
let ref_idx = ref_base.known_index()?;
let mut best: Option<(Base, i32)> = None;
for (i, &count) in self.counts.iter().enumerate() {
if i == ref_idx || count == 0 {
continue;
}
match best {
Some((_, prev)) if count <= prev => {}
_ => best = Some((BASES[i], count)),
}
}
best
}
}
fn main() -> anyhow::Result<()> {
let args = Cli::parse();
use seqair::bam::record_store::{CustomizeRecordStore, RecordStore, SlimRecord};
#[derive(Clone)]
struct DropUseless;
impl CustomizeRecordStore for DropUseless {
type Extra = ();
fn filter(&mut self, rec: &SlimRecord, _: &RecordStore<()>) -> bool {
!rec.flags.is_unmapped()
&& !rec.flags.is_secondary()
&& !rec.flags.is_supplementary()
&& !rec.flags.is_failed_qc()
&& !rec.flags.is_duplicate()
}
fn compute(&mut self, _: &SlimRecord, _: &RecordStore<()>) {}
}
let mut readers =
Readers::<DropUseless>::open_customized(&args.input, &args.reference, DropUseless)
.context("could not open BAM + FASTA")?;
let from_bam = VcfHeaderBuilder::from_bam_header(readers.header())
.context("could not build VCF header from BAM")?;
let region_tid = args.region.chromosome.as_str().resolve_tid(readers.header())?;
let region_contig =
from_bam.contig(region_tid).context("contig not registered in VCF header")?.clone();
let mut builder = from_bam.builder.infos();
let dp_info: InfoInt = builder.register_info(&InfoFieldDef::<Scalar<i32>>::new(
"DP",
Number::Count(1),
ValueType::Integer,
"Total depth",
))?;
let af_info: seqair::vcf::InfoFloats =
builder.register_info(&InfoFieldDef::<Arr<f32>>::new(
"AF",
Number::AlternateBases,
ValueType::Float,
"Allele frequency",
))?;
let mut builder = builder.formats();
let gt_fmt: FormatGt = builder.register_format(&FormatFieldDef::<Gt>::new(
"GT",
Number::Count(1),
ValueType::String,
"Genotype",
))?;
let dp_fmt: FormatInt = builder.register_format(&FormatFieldDef::<Scalar<i32>>::new(
"DP",
Number::Count(1),
ValueType::Integer,
"Sample depth",
))?;
let mut builder = builder.samples();
builder.add_sample(&args.sample)?;
let header = Arc::new(builder.build()?);
let output_format = args
.output
.as_ref()
.map(|p| OutputFormat::from_path(p))
.transpose()?
.unwrap_or(OutputFormat::Vcf);
let mut output: Box<dyn Write> = if let Some(ref path) = args.output {
Box::new(BufWriter::new(
std::fs::File::create(path).with_context(|| format!("could not create {path:?}"))?,
))
} else {
Box::new(BufWriter::new(std::io::stdout().lock()))
};
let vcf_writer = Writer::new(&mut output, output_format);
let mut vcf_writer = vcf_writer.write_header(&header)?;
let min_mapq = args.min_mapq;
let plan: Vec<_> = readers.segments(&args.region, SegmentOptions::default())?.collect();
let mut n_calls = 0u32;
for segment in &plan {
let mut pileup = readers.pileup(segment)?;
let core = segment.core_range();
while let Some(column) = pileup.pileups() {
if !core.contains(&column.pos()) {
continue;
}
let ref_base = column.reference_base();
if ref_base == Base::Unknown {
continue;
}
let mut bc = BaseCounts::new();
for aln in column.alignments() {
if aln.mapq < min_mapq {
continue;
}
if let PileupOp::Match { base, .. } | PileupOp::Insertion { base, .. } = &aln.op {
bc.add(*base);
}
}
if bc.total < args.min_depth as i32 {
continue;
}
let Some((alt_base, alt_count)) = bc.best_alt(ref_base) else {
continue;
};
let af = alt_count as f64 / bc.total as f64;
if af < args.min_af {
continue;
}
let pos1 = column.pos().to_one_based().context("position overflow")?;
let alleles = Alleles::snv(ref_base, alt_base)?;
let enc = vcf_writer.begin_record(®ion_contig, pos1, &alleles, None)?;
let mut enc = enc.filter_pass();
dp_info.encode(&mut enc, bc.total);
af_info.encode(&mut enc, &[af as f32]);
let mut enc = enc.begin_samples();
let ref_count = bc.counts[ref_base.known_index().unwrap_or(0)];
let gt = if af >= 0.8 { Genotype::unphased(1, 1) } else { Genotype::unphased(0, 1) };
gt_fmt.encode(&mut enc, &[gt])?;
dp_fmt.encode(&mut enc, &[bc.total])?;
_ = ref_count;
enc.emit()?;
n_calls += 1;
}
}
vcf_writer.finish()?;
output.flush()?;
eprintln!(
"called {n_calls} SNV(s) in {region} (min_depth={}, min_af={:.2})",
args.min_depth,
args.min_af,
region = args.region,
);
Ok(())
}