Skip to main content

fastqc_rust/utils/
quality_count.rs

1/// Per-position quality score accumulator.
2///
3/// Replicates the logic from `Utilities/QualityCount.java`.
4/// Accumulates quality score counts for a single read position.
5///
6/// JAVA COMPAT: Uses a fixed 150-slot array indexed by ASCII value, matching
7/// the Java `long[] actualCounts = new long[150]` exactly.
8pub struct QualityCount {
9    actual_counts: [u64; 150],
10    total_counts: u64,
11}
12
13impl QualityCount {
14    pub fn new() -> Self {
15        QualityCount {
16            actual_counts: [0u64; 150],
17            total_counts: 0,
18        }
19    }
20
21    /// Record a quality character (raw ASCII value, not offset-adjusted).
22    ///
23    /// Matches `addValue(char c)` which indexes by `(int)c`.
24    pub fn add_value(&mut self, quality_char: u8) {
25        let idx = quality_char as usize;
26        if idx >= self.actual_counts.len() {
27            // Java throws ArrayIndexOutOfBoundsException here, crashing
28            // the run. We clamp to the last slot instead so that corrupt quality chars
29            // don't abort the entire analysis -- the value will be wrong for that
30            // position, but the rest of the file can still be processed.
31            eprintln!(
32                "Warning: quality character '{}' (ASCII {}) exceeds maximum {}; clamping",
33                quality_char as char,
34                idx,
35                self.actual_counts.len() - 1
36            );
37            self.actual_counts[self.actual_counts.len() - 1] += 1;
38            self.total_counts += 1;
39            return;
40        }
41        self.actual_counts[idx] += 1;
42        self.total_counts += 1;
43    }
44
45    pub fn get_total_count(&self) -> u64 {
46        self.total_counts
47    }
48
49    /// Lowest ASCII character with a non-zero count.
50    pub fn get_min_char(&self) -> Option<u8> {
51        for i in 0..self.actual_counts.len() {
52            if self.actual_counts[i] > 0 {
53                return Some(i as u8);
54            }
55        }
56        None
57    }
58
59    /// Highest ASCII character with a non-zero count.
60    pub fn get_max_char(&self) -> Option<u8> {
61        for i in (0..self.actual_counts.len()).rev() {
62            if self.actual_counts[i] > 0 {
63                return Some(i as u8);
64            }
65        }
66        None
67    }
68
69    /// Weighted mean quality score (offset-adjusted).
70    ///
71    /// Matches `getMean(int offset)`. Iterates from `offset` to 149,
72    /// accumulating `count * (index - offset)` and dividing by total count.
73    /// Returns NaN when count is 0 (Java's 0.0/0 behaviour).
74    pub fn get_mean(&self, offset: u8) -> f64 {
75        let mut total: u64 = 0;
76        let mut count: u64 = 0;
77        let off = offset as usize;
78
79        for i in off..self.actual_counts.len() {
80            total += self.actual_counts[i] * (i - off) as u64;
81            count += self.actual_counts[i];
82        }
83
84        // JAVA COMPAT: Java returns ((double)total)/count which is NaN when count == 0.
85        total as f64 / count as f64
86    }
87
88    /// Percentile quality score (offset-adjusted).
89    ///
90    /// Matches `getPercentile(int offset, int percentile)` exactly,
91    /// including the use of integer arithmetic for threshold calculation:
92    ///   total = totalCounts * percentile / 100   (integer division)
93    /// This is critical for byte-exact output matching.
94    pub fn get_percentile(&self, offset: u8, percentile: u8) -> f64 {
95        // JAVA COMPAT: Java uses `long total = totalCounts; total *= percentile; total /= 100;`
96        // which is integer multiplication then integer division (truncating).
97        let mut threshold: u64 = self.total_counts;
98        threshold *= percentile as u64;
99        threshold /= 100;
100
101        let off = offset as usize;
102        let mut count: u64 = 0;
103
104        for i in off..self.actual_counts.len() {
105            count += self.actual_counts[i];
106            if count >= threshold {
107                // JAVA COMPAT: Java returns `(char)(i - offset)` cast to double.
108                // The char-to-double cast in Java just gives the numeric value.
109                return (i - off) as f64;
110            }
111        }
112
113        // JAVA COMPAT: Java returns -1 when no value found (cast to double = -1.0).
114        -1.0
115    }
116}
117
118/// Find the global minimum and maximum quality characters across any iterable
119/// of QualityCount values (owned or references).
120///
121/// Replicates the `calculateOffsets()` pattern used in
122/// PerBaseQualityScores.java and PerTileQualityScores.java.
123/// Returns (0, 0) if no quality data has been recorded.
124///
125/// Accepts `&[QualityCount]`, `&[&QualityCount]`, or any iterator that yields
126/// items borrowable as `&QualityCount`.
127pub fn calculate_offsets<I, Q>(counts: I) -> (u8, u8)
128where
129    I: IntoIterator<Item = Q>,
130    Q: std::borrow::Borrow<QualityCount>,
131{
132    let mut min_char: u8 = 0;
133    let mut max_char: u8 = 0;
134    let mut first = true;
135
136    for item in counts {
137        let qc = item.borrow();
138        if first {
139            if let (Some(lo), Some(hi)) = (qc.get_min_char(), qc.get_max_char()) {
140                min_char = lo;
141                max_char = hi;
142                first = false;
143            }
144        } else {
145            if let Some(mc) = qc.get_min_char() {
146                if mc < min_char {
147                    min_char = mc;
148                }
149            }
150            if let Some(mc) = qc.get_max_char() {
151                if mc > max_char {
152                    max_char = mc;
153                }
154            }
155        }
156    }
157
158    (min_char, max_char)
159}
160
161impl Default for QualityCount {
162    fn default() -> Self {
163        Self::new()
164    }
165}
166
167#[cfg(test)]
168mod tests {
169    use super::*;
170
171    #[test]
172    fn test_empty_counts() {
173        let qc = QualityCount::new();
174        assert_eq!(qc.get_total_count(), 0);
175        assert!(qc.get_min_char().is_none());
176        assert!(qc.get_max_char().is_none());
177        assert!(qc.get_mean(33).is_nan());
178    }
179
180    #[test]
181    fn test_single_value() {
182        let mut qc = QualityCount::new();
183        // ASCII 73 = 'I', with Sanger offset 33 this is quality 40
184        qc.add_value(b'I');
185        assert_eq!(qc.get_total_count(), 1);
186        assert_eq!(qc.get_min_char(), Some(b'I'));
187        assert_eq!(qc.get_max_char(), Some(b'I'));
188        assert!((qc.get_mean(33) - 40.0).abs() < f64::EPSILON);
189    }
190
191    #[test]
192    fn test_multiple_values() {
193        let mut qc = QualityCount::new();
194        // Add quality chars for Sanger: offset 33
195        // ASCII 53 = '5' -> quality 20
196        // ASCII 63 = '?' -> quality 30
197        qc.add_value(53);
198        qc.add_value(63);
199        assert_eq!(qc.get_total_count(), 2);
200        assert_eq!(qc.get_min_char(), Some(53));
201        assert_eq!(qc.get_max_char(), Some(63));
202        // Mean = (20 + 30) / 2 = 25.0
203        assert!((qc.get_mean(33) - 25.0).abs() < f64::EPSILON);
204    }
205
206    #[test]
207    fn test_percentile_integer_division() {
208        // This test verifies that the integer division behavior matches Java.
209        // With 1 count: threshold = 1 * 25 / 100 = 0 (integer division).
210        // The loop starts at offset (33). At i=33, count = actual_counts[33] = 0.
211        // Since 0 >= 0 is true, Java returns (char)(33-33) = (char)0 = 0.0.
212        // This is an artifact of the integer division producing threshold=0.
213        let mut qc = QualityCount::new();
214        qc.add_value(b'I'); // ASCII 73
215        let p25 = qc.get_percentile(33, 25);
216        // threshold = 1 * 25 / 100 = 0 in integer division
217        // First iteration: i=33, count=0, 0 >= 0 -> returns (33-33) = 0.0
218        assert!((p25 - 0.0).abs() < f64::EPSILON);
219
220        // The 100th percentile still works correctly:
221        // threshold = 1 * 100 / 100 = 1
222        // Accumulates until i=73 where count=1, 1 >= 1 -> returns (73-33) = 40.0
223        let p100 = qc.get_percentile(33, 100);
224        assert!((p100 - 40.0).abs() < f64::EPSILON);
225    }
226
227    #[test]
228    fn test_percentile_multiple() {
229        let mut qc = QualityCount::new();
230        // 3 values at quality 20 (ASCII 53), 7 values at quality 30 (ASCII 63)
231        for _ in 0..3 {
232            qc.add_value(53);
233        }
234        for _ in 0..7 {
235            qc.add_value(63);
236        }
237        // Median (50th percentile): threshold = 10 * 50 / 100 = 5
238        // Count at 53: 3 (< 5), count at 63: 10 (>= 5) -> quality 30
239        assert!((qc.get_percentile(33, 50) - 30.0).abs() < f64::EPSILON);
240
241        // 25th percentile: threshold = 10 * 25 / 100 = 2
242        // Count at 53: 3 (>= 2) -> quality 20
243        assert!((qc.get_percentile(33, 25) - 20.0).abs() < f64::EPSILON);
244
245        // 90th percentile: threshold = 10 * 90 / 100 = 9
246        // Count at 53: 3 (< 9), count at 63: 10 (>= 9) -> quality 30
247        assert!((qc.get_percentile(33, 90) - 30.0).abs() < f64::EPSILON);
248    }
249
250    #[test]
251    fn test_add_value_overflow_clamps() {
252        let mut qc = QualityCount::new();
253        // Values >= 150 are clamped to the last slot instead of panicking
254        qc.add_value(200);
255        assert_eq!(qc.get_total_count(), 1);
256        // The count should be in the last slot (index 149)
257        assert_eq!(qc.get_max_char(), Some(149));
258    }
259}