1use std::collections::BTreeMap;
21use std::io::{self, Write};
22
23use crate::call::features::{stream_features_whole_genome, write_feature_row, FEATURE_HEADER};
24use crate::core::{ContigSet, CoreError, WorkingSet};
25use crate::genomics::ReferenceView;
26use crate::pileup::{PileupColumn, PileupParams, ReadSource, SkipCounts};
27
28pub trait ColumnAnalyzer {
32 fn header(&self) -> Option<String> {
35 None
36 }
37
38 fn params(&self) -> BTreeMap<String, String> {
40 BTreeMap::new()
41 }
42
43 fn on_column(
47 &mut self,
48 col: &PileupColumn,
49 contig: &str,
50 out: &mut dyn Write,
51 ) -> io::Result<()>;
52}
53
54pub fn run_bounded_whole_genome<S: ReadSource>(
62 analyzer: &mut dyn ColumnAnalyzer,
63 source: S,
64 ref_view: &ReferenceView,
65 contigs: &ContigSet,
66 pileup_params: PileupParams,
67 out: &mut dyn Write,
68) -> Result<(WorkingSet, SkipCounts), CoreError> {
69 if let Some(header) = analyzer.header() {
70 out.write_all(header.as_bytes())?;
71 }
72 stream_features_whole_genome(
73 source,
74 ref_view,
75 contigs,
76 pileup_params,
77 &mut |col, contig| {
78 analyzer
79 .on_column(col, contig, &mut *out)
80 .map_err(CoreError::from)
81 },
82 )
83}
84
85#[derive(Debug, Default)]
89pub struct FeatureAnalyzer {
90 rows: u64,
91}
92
93impl FeatureAnalyzer {
94 pub fn rows(&self) -> u64 {
96 self.rows
97 }
98}
99
100impl ColumnAnalyzer for FeatureAnalyzer {
101 fn header(&self) -> Option<String> {
102 Some(format!("{FEATURE_HEADER}\n"))
103 }
104
105 fn params(&self) -> BTreeMap<String, String> {
106 let mut p = BTreeMap::new();
107 p.insert("feature_rows".to_string(), self.rows.to_string());
108 p
109 }
110
111 fn on_column(
112 &mut self,
113 col: &PileupColumn,
114 contig: &str,
115 out: &mut dyn Write,
116 ) -> io::Result<()> {
117 write_feature_row(out, contig, col)?;
118 self.rows += 1;
119 Ok(())
120 }
121}
122
123#[cfg(test)]
124mod tests {
125 use super::*;
126 use crate::call::features::write_feature_header;
127 use crate::genomics::{GenomeIndex, IndexReader, IndexWriter};
128 use crate::pileup::SliceSource;
129 use std::sync::Arc;
130
131 fn tmp(suffix: &str) -> std::path::PathBuf {
132 let nanos = std::time::SystemTime::now()
133 .duration_since(std::time::UNIX_EPOCH)
134 .unwrap()
135 .as_nanos();
136 let d = std::env::temp_dir().join(format!("rosalind-ck-{suffix}-{nanos}"));
137 std::fs::create_dir_all(&d).unwrap();
138 d.join("ref.idx")
139 }
140
141 fn read_at(contig: u32, pos: u32, seq: &[u8]) -> crate::core::AlignedRead {
142 use crate::core::{CigarOp, CigarOpKind, Position, SamFlags};
143 crate::core::AlignedRead {
144 contig,
145 pos: Position(pos),
146 mapq: 60,
147 flags: SamFlags(0),
148 cigar: vec![CigarOp::new(CigarOpKind::Match, seq.len() as u32)],
149 seq: Arc::from(seq.to_vec().into_boxed_slice()),
150 qual: Arc::from(vec![40u8; seq.len()].into_boxed_slice()),
151 }
152 }
153
154 #[test]
158 fn feature_analyzer_via_driver_equals_the_direct_features_path() {
159 let idx = tmp("equiv");
160 let index = GenomeIndex::from_named_sequences(&[
161 ("chr1".to_string(), b"ACGTACGTACGTACGTACGT".to_vec()),
162 ("chr2".to_string(), b"TTTTGGGGCCCCAAAATTTT".to_vec()),
163 ])
164 .unwrap();
165 IndexWriter::create(&idx)
166 .unwrap()
167 .write_genome_index(&index)
168 .unwrap();
169 let loaded = IndexReader::open(&idx).unwrap();
170 let rv = loaded.reference_view().unwrap();
171 let contigs = loaded.contigs();
172 let reads = vec![
173 read_at(0, 0, b"ACGTACGT"),
174 read_at(0, 0, b"ACGTACGT"),
175 read_at(1, 0, b"TTTTGGGG"),
176 ];
177 let pp = PileupParams::default();
178
179 let mut direct: Vec<u8> = Vec::new();
181 write_feature_header(&mut direct).unwrap();
182 let (ws_direct, _) = stream_features_whole_genome(
183 SliceSource::new(reads.clone()),
184 &rv,
185 contigs,
186 pp.clone(),
187 &mut |col, name| write_feature_row(&mut direct, name, col).map_err(CoreError::from),
188 )
189 .unwrap();
190
191 let mut analyzer = FeatureAnalyzer::default();
193 let mut via_kit: Vec<u8> = Vec::new();
194 let (ws_kit, _) = run_bounded_whole_genome(
195 &mut analyzer,
196 SliceSource::new(reads),
197 &rv,
198 contigs,
199 pp,
200 &mut via_kit,
201 )
202 .unwrap();
203
204 assert_eq!(direct, via_kit, "driver output must be byte-identical");
205 assert_eq!(
206 ws_direct.bytes, ws_kit.bytes,
207 "driver must report the same bounded working set"
208 );
209 assert!(analyzer.rows() > 0, "analyzer should count its rows");
210 assert_eq!(
211 analyzer.params().get("feature_rows").map(String::as_str),
212 Some(analyzer.rows().to_string().as_str())
213 );
214
215 let _ = std::fs::remove_dir_all(idx.parent().unwrap());
216 }
217
218 #[test]
221 fn a_custom_analyzer_inherits_the_bounded_working_set() {
222 struct DepthTrack {
223 max_depth_seen: u32,
224 }
225 impl ColumnAnalyzer for DepthTrack {
226 fn on_column(
227 &mut self,
228 col: &PileupColumn,
229 _contig: &str,
230 _out: &mut dyn Write,
231 ) -> io::Result<()> {
232 self.max_depth_seen = self.max_depth_seen.max(col.depth());
233 Ok(())
234 }
235 }
236
237 let idx = tmp("custom");
238 let index =
239 GenomeIndex::from_named_sequences(&[("chr1".to_string(), vec![b'A'; 500])]).unwrap();
240 IndexWriter::create(&idx)
241 .unwrap()
242 .write_genome_index(&index)
243 .unwrap();
244 let loaded = IndexReader::open(&idx).unwrap();
245 let rv = loaded.reference_view().unwrap();
246 let contigs = loaded.contigs();
247 let pp = PileupParams {
248 max_depth: Some(8),
249 ..PileupParams::default()
250 };
251
252 let run = |n: usize| -> u64 {
253 let reads: Vec<_> = (0..n).map(|_| read_at(0, 0, &[b'C'; 50])).collect();
254 let mut a = DepthTrack { max_depth_seen: 0 };
255 let mut sink = io::sink();
256 run_bounded_whole_genome(
257 &mut a,
258 SliceSource::new(reads),
259 &rv,
260 contigs,
261 pp.clone(),
262 &mut sink,
263 )
264 .unwrap()
265 .0
266 .bytes
267 };
268 assert_eq!(
270 run(20),
271 run(2000),
272 "analyzer working set must not grow with reads"
273 );
274
275 let _ = std::fs::remove_dir_all(idx.parent().unwrap());
276 }
277}