redicat_lib/engine/position/
pileup_position.rs

1//! An implementation of `Position` for dealing with pileups.
2//!
3//! This module provides the [`PileupPosition`] struct, which holds detailed information
4//! about a genomic position derived from a pileup. This includes:
5//!
6//! - Reference sequence name
7//! - Position
8//! - Reference base
9//! - Depth information
10//! - Nucleotide counts (A, C, G, T, N)
11//! - Indel counts (insertions, deletions)
12//! - Reference skip counts
13//! - Failed read counts
14//! - Near max depth flag
15//!
16//! The struct provides methods for creating instances from pileup data and updating counts.
17//!
18//! # PileupPosition Structure
19//!
20//! The [`PileupPosition`] struct contains all the information needed to represent
21//! a genomic position in a pileup analysis. It's designed to be serializable for
22//! easy output and implements the [`Position`] trait for compatibility with the
23//! par_granges framework.
24//!
25//! # Usage
26//!
27//! Typically, [`PileupPosition`] instances are created from htslib pileup objects:
28//!
29//! ```rust
30//! // This is typically done within a RegionProcessor implementation
31//! // let position = PileupPosition::from_pileup(pileup, header, read_filter, base_filter);
32//! ```
33
34use crate::core::read_filter::ReadFilter;
35use crate::engine::position::Position;
36use rust_htslib::bam::{
37    self,
38    pileup::{Alignment, Pileup},
39    record::Record,
40    HeaderView,
41};
42use serde::Serialize;
43use smartstring::{alias::String, LazyCompact, SmartString};
44use std::default;
45
46/// Hold all information about a position.
47// NB: The max depth that htslib will return is i32::MAX, and the type of pos for htlib is u32
48// There is no reason to go bigger, for now at least
49#[derive(Debug, Serialize, Default)]
50#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
51pub struct PileupPosition {
52    /// Reference sequence name.
53    #[serde(rename = "CHR")]
54    pub ref_seq: SmartString<LazyCompact>,
55    /// 1-based position in the sequence.
56    pub pos: u32,
57    /// The reference base at this position.
58    #[serde(skip_serializing_if = "Option::is_none")]
59    pub ref_base: Option<char>,
60    /// Total depth at this position.
61    pub depth: u32,
62    /// Number of A bases at this position.
63    pub a: u32,
64    /// Number of C bases at this position.
65    pub c: u32,
66    /// Number of G bases at this position.
67    pub g: u32,
68    /// Number of T bases at this position.
69    pub t: u32,
70    /// Number of N bases at this position. Any unrecognized base will be counted as an N.
71    pub n: u32,
72    /// Number of insertions that start to the right of this position.
73    /// Does not count toward depth.
74    pub ins: u32,
75    /// Number of deletions at this position.
76    pub del: u32,
77    /// Number of refskips at this position. Does not count toward depth.
78    pub ref_skip: u32,
79    /// Number of reads failing filters at this position.
80    pub fail: u32,
81    /// Depth is within 1% of max_depth
82    pub near_max_depth: bool,
83}
84
85impl Position for PileupPosition {
86    /// Create a new position for the given ref_seq name.
87    fn new(ref_seq: String, pos: u32) -> Self {
88        PileupPosition {
89            ref_seq: SmartString::from(ref_seq.as_str()),
90            pos,
91            ..default::Default::default()
92        }
93    }
94}
95
96impl PileupPosition {
97    /// Given a record, update the counts at this position
98    #[inline(always)]
99    fn update<F: ReadFilter>(
100        &mut self,
101        alignment: &Alignment,
102        record: &Record,
103        read_filter: &F,
104        base_filter: Option<u8>,
105    ) {
106        // 1. 过滤不满足条件的 read
107        if !read_filter.filter_read(record, Some(alignment)) {
108            self.depth -= 1;
109            self.fail += 1;
110            return;
111        }
112
113        // 2. 优先处理 refskip 和 deletion
114        // 注意:这里的顺序很重要,is_refskip() 为 true 时 is_del() 也可能为 true
115        if alignment.is_refskip() {
116            self.ref_skip += 1;
117            self.depth -= 1; // Refskip 不计入 depth
118        } else if alignment.is_del() {
119            self.del += 1;
120        } else {
121            // 3. 处理匹配的碱基
122            // .expect() 在调试时能提供更清晰的错误信息,发布版中与 unwrap() 无性能差异
123            let qpos = alignment
124                .qpos()
125                .expect("Pileup alignment should have a query position");
126
127            // 使用 map_or 优雅地处理 Option,判断碱基质量是否过低
128            let is_low_qual = base_filter.is_some_and(|cutoff| record.qual()[qpos] < cutoff);
129
130            if is_low_qual {
131                self.n += 1;
132            } else {
133                // 直接对 u8 (字节) 进行匹配,避免了 u8 -> char 的转换
134                match record.seq()[qpos].to_ascii_uppercase() {
135                    b'A' => self.a += 1,
136                    b'C' => self.c += 1,
137                    b'G' => self.g += 1,
138                    b'T' => self.t += 1,
139                    _ => self.n += 1,
140                }
141            }
142
143            // 4. 检查在此位置之后开始的 insertion
144            if let bam::pileup::Indel::Ins(_len) = alignment.indel() {
145                self.ins += 1;
146            }
147        }
148    }
149
150    /// Flag the position when observed depth is within 1% of the configured max depth.
151    #[inline]
152    pub fn mark_near_max_depth(&mut self, max_depth: u32) {
153        self.near_max_depth = false;
154        if max_depth == 0 {
155            return;
156        }
157
158        let combined_depth = self
159            .depth
160            .saturating_add(self.ref_skip)
161            .saturating_add(self.fail);
162        // Use the smaller of the raw pileup depth (including REF_SKIP/FAIL) and the filtered
163        // depth to remain consistent with the exported TSV columns (`DEPTH`, `REF_SKIP`, `FAIL`).
164        let effective_depth = combined_depth.min(self.depth);
165
166        let lhs = (effective_depth as u128).saturating_mul(101);
167        let rhs = (max_depth as u128).saturating_mul(100);
168
169        self.near_max_depth = lhs >= rhs;
170    }
171
172    /// Convert a pileup into a `Position`.
173    ///
174    /// This will walk over each of the alignments and count the number each nucleotide it finds.
175    /// It will also count the number of Ins/Dels/Skips that are at each position.
176    ///
177    /// # Arguments
178    ///
179    /// * `pileup` - a pileup at a genomic position
180    /// * `header` - a headerview for the bam file being read, to get the sequence name
181    /// * `read_filter` - a function to filter out reads, returning false will cause a read to be filtered
182    /// * `base_filter` - an optional base quality score. If Some(number) if the base quality is not >= that number the base is treated as an `N`
183    #[inline]
184    pub fn from_pileup<F: ReadFilter>(
185        pileup: Pileup,
186        header: &bam::HeaderView,
187        read_filter: &F,
188        base_filter: Option<u8>,
189    ) -> Self {
190        let name = Self::compact_refseq(header, pileup.tid());
191        // make output 1-based
192        let mut pos = Self::new(name, pileup.pos());
193        pos.depth = pileup.depth();
194
195        for alignment in pileup.alignments() {
196            let record = alignment.record();
197            Self::update(&mut pos, &alignment, &record, read_filter, base_filter);
198        }
199        pos
200    }
201
202    /// Convert a tid to a [`SmartString<LazyCompact>`].
203    #[inline]
204    pub fn compact_refseq(header: &HeaderView, tid: u32) -> SmartString<LazyCompact> {
205        let name = std::str::from_utf8(header.tid2name(tid)).unwrap();
206        String::from(name)
207    }
208}
209
210#[cfg(test)]
211mod tests {
212
213    use crate::core::read_filter::DefaultReadFilter;
214
215    use super::*;
216    use rust_htslib::bam::{IndexedReader, Read};
217    use std::path::Path;
218
219    #[test]
220    fn mark_near_max_depth_uses_observed_depth() {
221        let mut position = PileupPosition {
222            depth: 99_200,
223            fail: 900,
224            ..Default::default()
225        };
226
227        position.mark_near_max_depth(100_000);
228        assert!(position.near_max_depth);
229
230        position.depth = 100_000;
231        position.mark_near_max_depth(100_000);
232        assert!(position.near_max_depth);
233
234        position.depth = 98_500;
235        position.mark_near_max_depth(100_000);
236        assert!(!position.near_max_depth);
237
238        position.depth = 10;
239        position.mark_near_max_depth(0);
240        assert!(!position.near_max_depth);
241    }
242
243    #[test]
244    fn test_pileup_position_from_bam() {
245        let bam_path = Path::new("test/chr22.bam");
246        // test for chr22:50783283
247        let site_pos = 50783283;
248        // Only run this test if  the file exists
249        if bam_path.exists() {
250            // Open the BAM file
251            let mut bam = IndexedReader::from_path(bam_path).expect("Failed to open BAM file");
252
253            // Set the reference (not needed for this specific BAM)
254
255            bam.fetch(("chr22", site_pos - 1, site_pos))
256                .expect("Failed to fetch region");
257
258            // Get the header
259            let header = bam.header().to_owned();
260            let read_filter = DefaultReadFilter::new(255);
261
262            // Process the pileup
263            for pileup_result in bam.pileup() {
264                let pileup = pileup_result.expect("Failed to get pileup");
265                // Convert to PileupPosition
266                let position = PileupPosition::from_pileup(
267                    pileup,
268                    &header,
269                    &read_filter,
270                    Some(30), // No base quality filtering
271                );
272                if !(position.pos == site_pos - 1) {
273                    continue;
274                }
275                // Print the position information
276                eprintln!("Reference: {}", position.ref_seq);
277                eprintln!("Position: {}", position.pos + 1);
278                eprintln!("Reference base: {:?}", position.ref_base);
279                eprintln!("Depth: {}", position.depth);
280                eprintln!(
281                    "A: {}, C: {}, G: {}, T: {}, N: {} Insertions: {}, Deletions: {}, RefSkips: {}, Fail: {}",
282                    position.a, position.c, position.g, position.t, position.n, position.ins, position.del, position.ref_skip, position.fail
283                );
284                assert_eq!(position.c, 2208);
285                assert_eq!(position.t, 3);
286                eprintln!("Near max depth: {}", position.near_max_depth);
287
288                // We only want the first position
289                // break;
290            }
291        } else {
292            eprintln!("Test BAM file not found, skipping test");
293            // Panic to see if the test is running
294            panic!("Test BAM file not found");
295        }
296    }
297}