Skip to main content

fastqc_rust/modules/
basic_stats.rs

1// Basic Statistics module
2// Corresponds to Modules/BasicStats.java
3
4use std::io;
5
6use crate::config::Limits;
7use crate::modules::QCModule;
8use crate::sequence::Sequence;
9use crate::utils::base_counts::{BASE_INDEX, IDX_A, IDX_C, IDX_G, IDX_N, IDX_T};
10use crate::utils::phred;
11
12pub struct BasicStats {
13    name: Option<String>,
14    actual_count: u64,
15    filtered_count: u64,
16    min_length: usize,
17    max_length: usize,
18    total_bases: u64,
19    g_count: u64,
20    c_count: u64,
21    a_count: u64,
22    t_count: u64,
23    n_count: u64,
24    // Java initialises lowestChar to 126 (char), which is the highest
25    // printable ASCII. We use Option to represent "no quality chars seen yet".
26    lowest_char: u8,
27    file_type: Option<String>,
28}
29
30impl BasicStats {
31    pub fn new(_limits: &Limits) -> Self {
32        BasicStats {
33            name: None,
34            actual_count: 0,
35            filtered_count: 0,
36            min_length: 0,
37            max_length: 0,
38            total_bases: 0,
39            g_count: 0,
40            c_count: 0,
41            a_count: 0,
42            t_count: 0,
43            n_count: 0,
44            // Java starts at 126 (char), we mirror that
45            lowest_char: 126,
46            file_type: None,
47        }
48    }
49
50    /// Set the filename, stripping any "stdin:" prefix.
51    ///
52    /// Matches `setFileName()` which strips "stdin:" prefix.
53    pub fn set_file_name(&mut self, name: &str) {
54        let name = name.strip_prefix("stdin:").unwrap_or(name);
55        self.name = Some(name.to_string());
56    }
57
58    /// Format a base count into a human-readable string.
59    ///
60    /// Replicates `BasicStats.formatLength(long)` exactly, including
61    /// its custom decimal truncation logic (keeps at most 1 non-zero decimal digit).
62    pub fn format_length(original_length: u64) -> String {
63        let mut length = original_length as f64;
64        let unit;
65
66        if length >= 1_000_000_000.0 {
67            length /= 1_000_000_000.0;
68            unit = " Gbp";
69        } else if length >= 1_000_000.0 {
70            length /= 1_000_000.0;
71            unit = " Mbp";
72        } else if length >= 1_000.0 {
73            length /= 1_000.0;
74            unit = " kbp";
75        } else {
76            unit = " bp";
77        }
78
79        // JAVA COMPAT: Java builds `"" + length` which calls Double.toString(),
80        // then applies a custom truncation: find the dot, keep one more char if
81        // it's non-zero, otherwise drop the dot.
82        let raw = format!("{}", length);
83        let chars: Vec<char> = raw.chars().collect();
84
85        let mut last_index = 0;
86
87        // Find the dot
88        for (i, &ch) in chars.iter().enumerate() {
89            last_index = i;
90            if ch == '.' {
91                break;
92            }
93        }
94
95        // Keep next char if non-zero
96        if last_index + 1 < chars.len() && chars[last_index + 1] != '0' {
97            last_index += 1;
98        } else if last_index > 0 && chars[last_index] == '.' {
99            // Lose the dot if it would be the last character
100            last_index -= 1;
101        }
102
103        let truncated: String = chars[..=last_index].iter().collect();
104        format!("{}{}", truncated, unit)
105    }
106}
107
108impl QCModule for BasicStats {
109    fn process_sequence(&mut self, sequence: &Sequence) {
110        // Java counts filtered sequences separately
111        if sequence.is_filtered {
112            self.filtered_count += 1;
113            return;
114        }
115
116        self.actual_count += 1;
117        self.total_bases += sequence.sequence.len() as u64;
118
119        if self.file_type.is_none() {
120            self.file_type = if sequence.colorspace.is_some() {
121                Some("Colorspace converted to bases".to_string())
122            } else {
123                Some("Conventional base calls".to_string())
124            };
125        }
126
127        // min/max length initialised on first non-filtered sequence
128        let len = sequence.sequence.len();
129        if self.actual_count == 1 {
130            self.min_length = len;
131            self.max_length = len;
132        } else {
133            self.min_length = self.min_length.min(len);
134            self.max_length = self.max_length.max(len);
135        }
136
137        // Use lookup table to avoid branch misprediction on random DNA data
138        let mut counts = [0u64; 6];
139        for &b in &sequence.sequence {
140            counts[BASE_INDEX[b as usize] as usize] += 1;
141        }
142        self.a_count += counts[IDX_A];
143        self.c_count += counts[IDX_C];
144        self.g_count += counts[IDX_G];
145        self.t_count += counts[IDX_T];
146        self.n_count += counts[IDX_N];
147
148        for &q in &sequence.quality {
149            if q < self.lowest_char {
150                self.lowest_char = q;
151            }
152        }
153    }
154
155    fn set_filename(&mut self, name: &str) {
156        self.set_file_name(name);
157    }
158
159    fn name(&self) -> &str {
160        "Basic Statistics"
161    }
162
163    fn description(&self) -> &str {
164        "Calculates some basic statistics about the file"
165    }
166
167    fn reset(&mut self) {
168        self.min_length = 0;
169        self.max_length = 0;
170        self.g_count = 0;
171        self.c_count = 0;
172        self.a_count = 0;
173        self.t_count = 0;
174        self.n_count = 0;
175    }
176
177    // BasicStats never raises error or warning
178    fn raises_error(&self) -> bool {
179        false
180    }
181
182    fn raises_warning(&self) -> bool {
183        false
184    }
185
186    fn ignore_filtered_sequences(&self) -> bool {
187        // BasicStats processes filtered sequences (to count them)
188        false
189    }
190
191    fn ignore_in_report(&self) -> bool {
192        false
193    }
194
195    fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
196        // Header row matches writeTextTable output from AbstractQCModule
197        writeln!(writer, "#Measure\tValue")?;
198
199        // Row 0: Filename
200        writeln!(writer, "Filename\t{}", self.name.as_deref().unwrap_or(""))?;
201
202        // Row 1: File type
203        writeln!(
204            writer,
205            "File type\t{}",
206            self.file_type
207                .as_deref()
208                .unwrap_or("Conventional base calls")
209        )?;
210
211        // Row 2: Encoding
212        // Uses PhredEncoding.getFastQEncodingOffset(lowestChar)
213        let encoding_name = phred::detect(self.lowest_char)
214            .map(|e| e.name.to_string())
215            .unwrap_or_else(|_| "Unknown".to_string());
216        writeln!(writer, "Encoding\t{}", encoding_name)?;
217
218        // Row 3: Total Sequences
219        writeln!(writer, "Total Sequences\t{}", self.actual_count)?;
220
221        // Row 4: Total Bases
222        writeln!(
223            writer,
224            "Total Bases\t{}",
225            Self::format_length(self.total_bases)
226        )?;
227
228        // Row 5: Sequences flagged as poor quality
229        writeln!(
230            writer,
231            "Sequences flagged as poor quality\t{}",
232            self.filtered_count
233        )?;
234
235        // Row 6: Sequence length
236        if self.min_length == self.max_length {
237            writeln!(writer, "Sequence length\t{}", self.min_length)?;
238        } else {
239            writeln!(
240                writer,
241                "Sequence length\t{}-{}",
242                self.min_length, self.max_length
243            )?;
244        }
245
246        // Row 7: %GC
247        // JAVA COMPAT: Integer division: ((gCount+cCount)*100)/(aCount+tCount+gCount+cCount)
248        let total = self.a_count + self.t_count + self.g_count + self.c_count;
249        let gc = ((self.g_count + self.c_count) * 100)
250            .checked_div(total)
251            .unwrap_or(0);
252        writeln!(writer, "%GC\t{}", gc)?;
253
254        Ok(())
255    }
256}
257
258#[cfg(test)]
259mod tests {
260    use super::*;
261
262    #[test]
263    fn test_format_length_bp() {
264        assert_eq!(BasicStats::format_length(16), "16 bp");
265        assert_eq!(BasicStats::format_length(80), "80 bp");
266        assert_eq!(BasicStats::format_length(999), "999 bp");
267    }
268
269    #[test]
270    fn test_format_length_kbp() {
271        assert_eq!(BasicStats::format_length(1000), "1 kbp");
272        assert_eq!(BasicStats::format_length(1500), "1.5 kbp");
273        assert_eq!(BasicStats::format_length(10000), "10 kbp");
274    }
275
276    #[test]
277    fn test_format_length_mbp() {
278        assert_eq!(BasicStats::format_length(1_000_000), "1 Mbp");
279        assert_eq!(BasicStats::format_length(1_200_000), "1.2 Mbp");
280    }
281
282    #[test]
283    fn test_format_length_gbp() {
284        assert_eq!(BasicStats::format_length(1_000_000_000), "1 Gbp");
285    }
286}