use std::collections::BTreeMap;
use std::io::{self, Write};
use crate::call::features::{stream_features_whole_genome, write_feature_row, FEATURE_HEADER};
use crate::core::{ContigSet, CoreError, WorkingSet};
use crate::genomics::ReferenceView;
use crate::pileup::{PileupColumn, PileupParams, ReadSource, SkipCounts};
pub trait ColumnAnalyzer {
fn header(&self) -> Option<String> {
None
}
fn params(&self) -> BTreeMap<String, String> {
BTreeMap::new()
}
fn on_column(
&mut self,
col: &PileupColumn,
contig: &str,
out: &mut dyn Write,
) -> io::Result<()>;
}
pub fn run_bounded_whole_genome<S: ReadSource>(
analyzer: &mut dyn ColumnAnalyzer,
source: S,
ref_view: &ReferenceView,
contigs: &ContigSet,
pileup_params: PileupParams,
out: &mut dyn Write,
) -> Result<(WorkingSet, SkipCounts), CoreError> {
if let Some(header) = analyzer.header() {
out.write_all(header.as_bytes())?;
}
stream_features_whole_genome(
source,
ref_view,
contigs,
pileup_params,
&mut |col, contig| {
analyzer
.on_column(col, contig, &mut *out)
.map_err(CoreError::from)
},
)
}
#[derive(Debug, Default)]
pub struct FeatureAnalyzer {
rows: u64,
}
impl FeatureAnalyzer {
pub fn rows(&self) -> u64 {
self.rows
}
}
impl ColumnAnalyzer for FeatureAnalyzer {
fn header(&self) -> Option<String> {
Some(format!("{FEATURE_HEADER}\n"))
}
fn params(&self) -> BTreeMap<String, String> {
let mut p = BTreeMap::new();
p.insert("feature_rows".to_string(), self.rows.to_string());
p
}
fn on_column(
&mut self,
col: &PileupColumn,
contig: &str,
out: &mut dyn Write,
) -> io::Result<()> {
write_feature_row(out, contig, col)?;
self.rows += 1;
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::call::features::write_feature_header;
use crate::genomics::{GenomeIndex, IndexReader, IndexWriter};
use crate::pileup::SliceSource;
use std::sync::Arc;
fn tmp(suffix: &str) -> std::path::PathBuf {
let nanos = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos();
let d = std::env::temp_dir().join(format!("rosalind-ck-{suffix}-{nanos}"));
std::fs::create_dir_all(&d).unwrap();
d.join("ref.idx")
}
fn read_at(contig: u32, pos: u32, seq: &[u8]) -> crate::core::AlignedRead {
use crate::core::{CigarOp, CigarOpKind, Position, SamFlags};
crate::core::AlignedRead {
contig,
pos: Position(pos),
mapq: 60,
flags: SamFlags(0),
cigar: vec![CigarOp::new(CigarOpKind::Match, seq.len() as u32)],
seq: Arc::from(seq.to_vec().into_boxed_slice()),
qual: Arc::from(vec![40u8; seq.len()].into_boxed_slice()),
}
}
#[test]
fn feature_analyzer_via_driver_equals_the_direct_features_path() {
let idx = tmp("equiv");
let index = GenomeIndex::from_named_sequences(&[
("chr1".to_string(), b"ACGTACGTACGTACGTACGT".to_vec()),
("chr2".to_string(), b"TTTTGGGGCCCCAAAATTTT".to_vec()),
])
.unwrap();
IndexWriter::create(&idx)
.unwrap()
.write_genome_index(&index)
.unwrap();
let loaded = IndexReader::open(&idx).unwrap();
let rv = loaded.reference_view().unwrap();
let contigs = loaded.contigs();
let reads = vec![
read_at(0, 0, b"ACGTACGT"),
read_at(0, 0, b"ACGTACGT"),
read_at(1, 0, b"TTTTGGGG"),
];
let pp = PileupParams::default();
let mut direct: Vec<u8> = Vec::new();
write_feature_header(&mut direct).unwrap();
let (ws_direct, _) = stream_features_whole_genome(
SliceSource::new(reads.clone()),
&rv,
contigs,
pp.clone(),
&mut |col, name| write_feature_row(&mut direct, name, col).map_err(CoreError::from),
)
.unwrap();
let mut analyzer = FeatureAnalyzer::default();
let mut via_kit: Vec<u8> = Vec::new();
let (ws_kit, _) = run_bounded_whole_genome(
&mut analyzer,
SliceSource::new(reads),
&rv,
contigs,
pp,
&mut via_kit,
)
.unwrap();
assert_eq!(direct, via_kit, "driver output must be byte-identical");
assert_eq!(
ws_direct.bytes, ws_kit.bytes,
"driver must report the same bounded working set"
);
assert!(analyzer.rows() > 0, "analyzer should count its rows");
assert_eq!(
analyzer.params().get("feature_rows").map(String::as_str),
Some(analyzer.rows().to_string().as_str())
);
let _ = std::fs::remove_dir_all(idx.parent().unwrap());
}
#[test]
fn a_custom_analyzer_inherits_the_bounded_working_set() {
struct DepthTrack {
max_depth_seen: u32,
}
impl ColumnAnalyzer for DepthTrack {
fn on_column(
&mut self,
col: &PileupColumn,
_contig: &str,
_out: &mut dyn Write,
) -> io::Result<()> {
self.max_depth_seen = self.max_depth_seen.max(col.depth());
Ok(())
}
}
let idx = tmp("custom");
let index =
GenomeIndex::from_named_sequences(&[("chr1".to_string(), vec![b'A'; 500])]).unwrap();
IndexWriter::create(&idx)
.unwrap()
.write_genome_index(&index)
.unwrap();
let loaded = IndexReader::open(&idx).unwrap();
let rv = loaded.reference_view().unwrap();
let contigs = loaded.contigs();
let pp = PileupParams {
max_depth: Some(8),
..PileupParams::default()
};
let run = |n: usize| -> u64 {
let reads: Vec<_> = (0..n).map(|_| read_at(0, 0, &[b'C'; 50])).collect();
let mut a = DepthTrack { max_depth_seen: 0 };
let mut sink = io::sink();
run_bounded_whole_genome(
&mut a,
SliceSource::new(reads),
&rv,
contigs,
pp.clone(),
&mut sink,
)
.unwrap()
.0
.bytes
};
assert_eq!(
run(20),
run(2000),
"analyzer working set must not grow with reads"
);
let _ = std::fs::remove_dir_all(idx.parent().unwrap());
}
}