redicat_lib/engine/position/pileup_position.rs
1//! An implementation of `Position` for dealing with pileups.
2//!
3//! This module provides the [`PileupPosition`] struct, which holds detailed information
4//! about a genomic position derived from a pileup. This includes:
5//!
6//! - Reference sequence name
7//! - Position
8//! - Reference base
9//! - Depth information
10//! - Nucleotide counts (A, C, G, T, N)
11//! - Indel counts (insertions, deletions)
12//! - Reference skip counts
13//! - Failed read counts
14//! - Near max depth flag
15//!
16//! The struct provides methods for creating instances from pileup data and updating counts.
17//!
18//! # PileupPosition Structure
19//!
20//! The [`PileupPosition`] struct contains all the information needed to represent
21//! a genomic position in a pileup analysis. It's designed to be serializable for
22//! easy output and implements the [`Position`] trait for compatibility with the
23//! par_granges framework.
24//!
25//! # Usage
26//!
27//! Typically, [`PileupPosition`] instances are created from htslib pileup objects:
28//!
29//! ```rust
30//! // This is typically done within a RegionProcessor implementation
31//! // let position = PileupPosition::from_pileup(pileup, header, read_filter, base_filter);
32//! ```
33
34use crate::core::read_filter::ReadFilter;
35use crate::engine::position::Position;
36use rust_htslib::bam::{
37 self,
38 pileup::{Alignment, Pileup},
39 record::Record,
40 HeaderView,
41};
42use serde::Serialize;
43use smartstring::{alias::String, LazyCompact, SmartString};
44use std::default;
45
46/// Hold all information about a position.
47// NB: The max depth that htslib will return is i32::MAX, and the type of pos for htlib is u32
48// There is no reason to go bigger, for now at least
49#[derive(Debug, Serialize, Default)]
50#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
51pub struct PileupPosition {
52 /// Reference sequence name.
53 #[serde(rename = "CHR")]
54 pub ref_seq: SmartString<LazyCompact>,
55 /// 1-based position in the sequence.
56 pub pos: u32,
57 /// The reference base at this position.
58 #[serde(skip_serializing_if = "Option::is_none")]
59 pub ref_base: Option<char>,
60 /// Total depth at this position.
61 pub depth: u32,
62 /// Number of A bases at this position.
63 pub a: u32,
64 /// Number of C bases at this position.
65 pub c: u32,
66 /// Number of G bases at this position.
67 pub g: u32,
68 /// Number of T bases at this position.
69 pub t: u32,
70 /// Number of N bases at this position. Any unrecognized base will be counted as an N.
71 pub n: u32,
72 /// Number of insertions that start to the right of this position.
73 /// Does not count toward depth.
74 pub ins: u32,
75 /// Number of deletions at this position.
76 pub del: u32,
77 /// Number of refskips at this position. Does not count toward depth.
78 pub ref_skip: u32,
79 /// Number of reads failing filters at this position.
80 pub fail: u32,
81 /// Depth is within 1% of max_depth
82 pub near_max_depth: bool,
83}
84
85impl Position for PileupPosition {
86 /// Create a new position for the given ref_seq name.
87 fn new(ref_seq: String, pos: u32) -> Self {
88 PileupPosition {
89 ref_seq: SmartString::from(ref_seq.as_str()),
90 pos,
91 ..default::Default::default()
92 }
93 }
94}
95
96impl PileupPosition {
97 /// Given a record, update the counts at this position
98 #[inline(always)]
99 fn update<F: ReadFilter>(
100 &mut self,
101 alignment: &Alignment,
102 record: &Record,
103 read_filter: &F,
104 base_filter: Option<u8>,
105 ) {
106 // 1. 过滤不满足条件的 read
107 if !read_filter.filter_read(record, Some(alignment)) {
108 self.depth -= 1;
109 self.fail += 1;
110 return;
111 }
112
113 // 2. 优先处理 refskip 和 deletion
114 // 注意:这里的顺序很重要,is_refskip() 为 true 时 is_del() 也可能为 true
115 if alignment.is_refskip() {
116 self.ref_skip += 1;
117 self.depth -= 1; // Refskip 不计入 depth
118 } else if alignment.is_del() {
119 self.del += 1;
120 } else {
121 // 3. 处理匹配的碱基
122 // .expect() 在调试时能提供更清晰的错误信息,发布版中与 unwrap() 无性能差异
123 let qpos = alignment
124 .qpos()
125 .expect("Pileup alignment should have a query position");
126
127 // 使用 map_or 优雅地处理 Option,判断碱基质量是否过低
128 let is_low_qual = base_filter.is_some_and(|cutoff| record.qual()[qpos] < cutoff);
129
130 if is_low_qual {
131 self.n += 1;
132 } else {
133 // 直接对 u8 (字节) 进行匹配,避免了 u8 -> char 的转换
134 match record.seq()[qpos].to_ascii_uppercase() {
135 b'A' => self.a += 1,
136 b'C' => self.c += 1,
137 b'G' => self.g += 1,
138 b'T' => self.t += 1,
139 _ => self.n += 1,
140 }
141 }
142
143 // 4. 检查在此位置之后开始的 insertion
144 if let bam::pileup::Indel::Ins(_len) = alignment.indel() {
145 self.ins += 1;
146 }
147 }
148 }
149
150 /// Flag the position when observed depth is within 1% of the configured max depth.
151 #[inline]
152 pub fn mark_near_max_depth(&mut self, max_depth: u32) {
153 self.near_max_depth = false;
154 if max_depth == 0 {
155 return;
156 }
157
158 let combined_depth = self
159 .depth
160 .saturating_add(self.ref_skip)
161 .saturating_add(self.fail);
162 // Use the smaller of the raw pileup depth (including REF_SKIP/FAIL) and the filtered
163 // depth to remain consistent with the exported TSV columns (`DEPTH`, `REF_SKIP`, `FAIL`).
164 let effective_depth = combined_depth.min(self.depth);
165
166 let lhs = (effective_depth as u128).saturating_mul(101);
167 let rhs = (max_depth as u128).saturating_mul(100);
168
169 self.near_max_depth = lhs >= rhs;
170 }
171
172 /// Convert a pileup into a `Position`.
173 ///
174 /// This will walk over each of the alignments and count the number each nucleotide it finds.
175 /// It will also count the number of Ins/Dels/Skips that are at each position.
176 ///
177 /// # Arguments
178 ///
179 /// * `pileup` - a pileup at a genomic position
180 /// * `header` - a headerview for the bam file being read, to get the sequence name
181 /// * `read_filter` - a function to filter out reads, returning false will cause a read to be filtered
182 /// * `base_filter` - an optional base quality score. If Some(number) if the base quality is not >= that number the base is treated as an `N`
183 #[inline]
184 pub fn from_pileup<F: ReadFilter>(
185 pileup: Pileup,
186 header: &bam::HeaderView,
187 read_filter: &F,
188 base_filter: Option<u8>,
189 ) -> Self {
190 let name = Self::compact_refseq(header, pileup.tid());
191 // make output 1-based
192 let mut pos = Self::new(name, pileup.pos());
193 pos.depth = pileup.depth();
194
195 for alignment in pileup.alignments() {
196 let record = alignment.record();
197 Self::update(&mut pos, &alignment, &record, read_filter, base_filter);
198 }
199 pos
200 }
201
202 /// Convert a tid to a [`SmartString<LazyCompact>`].
203 #[inline]
204 pub fn compact_refseq(header: &HeaderView, tid: u32) -> SmartString<LazyCompact> {
205 let name = std::str::from_utf8(header.tid2name(tid)).unwrap();
206 String::from(name)
207 }
208}
209
210#[cfg(test)]
211mod tests {
212
213 use crate::core::read_filter::DefaultReadFilter;
214
215 use super::*;
216 use rust_htslib::bam::{IndexedReader, Read};
217 use std::path::Path;
218
219 #[test]
220 fn mark_near_max_depth_uses_observed_depth() {
221 let mut position = PileupPosition {
222 depth: 99_200,
223 fail: 900,
224 ..Default::default()
225 };
226
227 position.mark_near_max_depth(100_000);
228 assert!(position.near_max_depth);
229
230 position.depth = 100_000;
231 position.mark_near_max_depth(100_000);
232 assert!(position.near_max_depth);
233
234 position.depth = 98_500;
235 position.mark_near_max_depth(100_000);
236 assert!(!position.near_max_depth);
237
238 position.depth = 10;
239 position.mark_near_max_depth(0);
240 assert!(!position.near_max_depth);
241 }
242
243 #[test]
244 fn test_pileup_position_from_bam() {
245 let bam_path = Path::new("test/chr22.bam");
246 // test for chr22:50783283
247 let site_pos = 50783283;
248 // Only run this test if the file exists
249 if bam_path.exists() {
250 // Open the BAM file
251 let mut bam = IndexedReader::from_path(bam_path).expect("Failed to open BAM file");
252
253 // Set the reference (not needed for this specific BAM)
254
255 bam.fetch(("chr22", site_pos - 1, site_pos))
256 .expect("Failed to fetch region");
257
258 // Get the header
259 let header = bam.header().to_owned();
260 let read_filter = DefaultReadFilter::new(255);
261
262 // Process the pileup
263 for pileup_result in bam.pileup() {
264 let pileup = pileup_result.expect("Failed to get pileup");
265 // Convert to PileupPosition
266 let position = PileupPosition::from_pileup(
267 pileup,
268 &header,
269 &read_filter,
270 Some(30), // No base quality filtering
271 );
272 if !(position.pos == site_pos - 1) {
273 continue;
274 }
275 // Print the position information
276 eprintln!("Reference: {}", position.ref_seq);
277 eprintln!("Position: {}", position.pos + 1);
278 eprintln!("Reference base: {:?}", position.ref_base);
279 eprintln!("Depth: {}", position.depth);
280 eprintln!(
281 "A: {}, C: {}, G: {}, T: {}, N: {} Insertions: {}, Deletions: {}, RefSkips: {}, Fail: {}",
282 position.a, position.c, position.g, position.t, position.n, position.ins, position.del, position.ref_skip, position.fail
283 );
284 assert_eq!(position.c, 2208);
285 assert_eq!(position.t, 3);
286 eprintln!("Near max depth: {}", position.near_max_depth);
287
288 // We only want the first position
289 // break;
290 }
291 } else {
292 eprintln!("Test BAM file not found, skipping test");
293 // Panic to see if the test is running
294 panic!("Test BAM file not found");
295 }
296 }
297}