fastqc_rust/utils/quality_count.rs
1/// Per-position quality score accumulator.
2///
3/// Replicates the logic from `Utilities/QualityCount.java`.
4/// Accumulates quality score counts for a single read position.
5///
6/// JAVA COMPAT: Uses a fixed 150-slot array indexed by ASCII value, matching
7/// the Java `long[] actualCounts = new long[150]` exactly.
8pub struct QualityCount {
9 actual_counts: [u64; 150],
10 total_counts: u64,
11}
12
13impl QualityCount {
14 pub fn new() -> Self {
15 QualityCount {
16 actual_counts: [0u64; 150],
17 total_counts: 0,
18 }
19 }
20
21 /// Record a quality character (raw ASCII value, not offset-adjusted).
22 ///
23 /// Matches `addValue(char c)` which indexes by `(int)c`.
24 pub fn add_value(&mut self, quality_char: u8) {
25 let idx = quality_char as usize;
26 if idx >= self.actual_counts.len() {
27 // Java throws ArrayIndexOutOfBoundsException here, crashing
28 // the run. We clamp to the last slot instead so that corrupt quality chars
29 // don't abort the entire analysis -- the value will be wrong for that
30 // position, but the rest of the file can still be processed.
31 eprintln!(
32 "Warning: quality character '{}' (ASCII {}) exceeds maximum {}; clamping",
33 quality_char as char,
34 idx,
35 self.actual_counts.len() - 1
36 );
37 self.actual_counts[self.actual_counts.len() - 1] += 1;
38 self.total_counts += 1;
39 return;
40 }
41 self.actual_counts[idx] += 1;
42 self.total_counts += 1;
43 }
44
45 pub fn get_total_count(&self) -> u64 {
46 self.total_counts
47 }
48
49 /// Lowest ASCII character with a non-zero count.
50 pub fn get_min_char(&self) -> Option<u8> {
51 for i in 0..self.actual_counts.len() {
52 if self.actual_counts[i] > 0 {
53 return Some(i as u8);
54 }
55 }
56 None
57 }
58
59 /// Highest ASCII character with a non-zero count.
60 pub fn get_max_char(&self) -> Option<u8> {
61 for i in (0..self.actual_counts.len()).rev() {
62 if self.actual_counts[i] > 0 {
63 return Some(i as u8);
64 }
65 }
66 None
67 }
68
69 /// Weighted mean quality score (offset-adjusted).
70 ///
71 /// Matches `getMean(int offset)`. Iterates from `offset` to 149,
72 /// accumulating `count * (index - offset)` and dividing by total count.
73 /// Returns NaN when count is 0 (Java's 0.0/0 behaviour).
74 pub fn get_mean(&self, offset: u8) -> f64 {
75 let mut total: u64 = 0;
76 let mut count: u64 = 0;
77 let off = offset as usize;
78
79 for i in off..self.actual_counts.len() {
80 total += self.actual_counts[i] * (i - off) as u64;
81 count += self.actual_counts[i];
82 }
83
84 // JAVA COMPAT: Java returns ((double)total)/count which is NaN when count == 0.
85 total as f64 / count as f64
86 }
87
88 /// Percentile quality score (offset-adjusted).
89 ///
90 /// Matches `getPercentile(int offset, int percentile)` exactly,
91 /// including the use of integer arithmetic for threshold calculation:
92 /// total = totalCounts * percentile / 100 (integer division)
93 /// This is critical for byte-exact output matching.
94 pub fn get_percentile(&self, offset: u8, percentile: u8) -> f64 {
95 // JAVA COMPAT: Java uses `long total = totalCounts; total *= percentile; total /= 100;`
96 // which is integer multiplication then integer division (truncating).
97 let mut threshold: u64 = self.total_counts;
98 threshold *= percentile as u64;
99 threshold /= 100;
100
101 let off = offset as usize;
102 let mut count: u64 = 0;
103
104 for i in off..self.actual_counts.len() {
105 count += self.actual_counts[i];
106 if count >= threshold {
107 // JAVA COMPAT: Java returns `(char)(i - offset)` cast to double.
108 // The char-to-double cast in Java just gives the numeric value.
109 return (i - off) as f64;
110 }
111 }
112
113 // JAVA COMPAT: Java returns -1 when no value found (cast to double = -1.0).
114 -1.0
115 }
116}
117
118/// Find the global minimum and maximum quality characters across any iterable
119/// of QualityCount values (owned or references).
120///
121/// Replicates the `calculateOffsets()` pattern used in
122/// PerBaseQualityScores.java and PerTileQualityScores.java.
123/// Returns (0, 0) if no quality data has been recorded.
124///
125/// Accepts `&[QualityCount]`, `&[&QualityCount]`, or any iterator that yields
126/// items borrowable as `&QualityCount`.
127pub fn calculate_offsets<I, Q>(counts: I) -> (u8, u8)
128where
129 I: IntoIterator<Item = Q>,
130 Q: std::borrow::Borrow<QualityCount>,
131{
132 let mut min_char: u8 = 0;
133 let mut max_char: u8 = 0;
134 let mut first = true;
135
136 for item in counts {
137 let qc = item.borrow();
138 if first {
139 if let (Some(lo), Some(hi)) = (qc.get_min_char(), qc.get_max_char()) {
140 min_char = lo;
141 max_char = hi;
142 first = false;
143 }
144 } else {
145 if let Some(mc) = qc.get_min_char() {
146 if mc < min_char {
147 min_char = mc;
148 }
149 }
150 if let Some(mc) = qc.get_max_char() {
151 if mc > max_char {
152 max_char = mc;
153 }
154 }
155 }
156 }
157
158 (min_char, max_char)
159}
160
161impl Default for QualityCount {
162 fn default() -> Self {
163 Self::new()
164 }
165}
166
167#[cfg(test)]
168mod tests {
169 use super::*;
170
171 #[test]
172 fn test_empty_counts() {
173 let qc = QualityCount::new();
174 assert_eq!(qc.get_total_count(), 0);
175 assert!(qc.get_min_char().is_none());
176 assert!(qc.get_max_char().is_none());
177 assert!(qc.get_mean(33).is_nan());
178 }
179
180 #[test]
181 fn test_single_value() {
182 let mut qc = QualityCount::new();
183 // ASCII 73 = 'I', with Sanger offset 33 this is quality 40
184 qc.add_value(b'I');
185 assert_eq!(qc.get_total_count(), 1);
186 assert_eq!(qc.get_min_char(), Some(b'I'));
187 assert_eq!(qc.get_max_char(), Some(b'I'));
188 assert!((qc.get_mean(33) - 40.0).abs() < f64::EPSILON);
189 }
190
191 #[test]
192 fn test_multiple_values() {
193 let mut qc = QualityCount::new();
194 // Add quality chars for Sanger: offset 33
195 // ASCII 53 = '5' -> quality 20
196 // ASCII 63 = '?' -> quality 30
197 qc.add_value(53);
198 qc.add_value(63);
199 assert_eq!(qc.get_total_count(), 2);
200 assert_eq!(qc.get_min_char(), Some(53));
201 assert_eq!(qc.get_max_char(), Some(63));
202 // Mean = (20 + 30) / 2 = 25.0
203 assert!((qc.get_mean(33) - 25.0).abs() < f64::EPSILON);
204 }
205
206 #[test]
207 fn test_percentile_integer_division() {
208 // This test verifies that the integer division behavior matches Java.
209 // With 1 count: threshold = 1 * 25 / 100 = 0 (integer division).
210 // The loop starts at offset (33). At i=33, count = actual_counts[33] = 0.
211 // Since 0 >= 0 is true, Java returns (char)(33-33) = (char)0 = 0.0.
212 // This is an artifact of the integer division producing threshold=0.
213 let mut qc = QualityCount::new();
214 qc.add_value(b'I'); // ASCII 73
215 let p25 = qc.get_percentile(33, 25);
216 // threshold = 1 * 25 / 100 = 0 in integer division
217 // First iteration: i=33, count=0, 0 >= 0 -> returns (33-33) = 0.0
218 assert!((p25 - 0.0).abs() < f64::EPSILON);
219
220 // The 100th percentile still works correctly:
221 // threshold = 1 * 100 / 100 = 1
222 // Accumulates until i=73 where count=1, 1 >= 1 -> returns (73-33) = 40.0
223 let p100 = qc.get_percentile(33, 100);
224 assert!((p100 - 40.0).abs() < f64::EPSILON);
225 }
226
227 #[test]
228 fn test_percentile_multiple() {
229 let mut qc = QualityCount::new();
230 // 3 values at quality 20 (ASCII 53), 7 values at quality 30 (ASCII 63)
231 for _ in 0..3 {
232 qc.add_value(53);
233 }
234 for _ in 0..7 {
235 qc.add_value(63);
236 }
237 // Median (50th percentile): threshold = 10 * 50 / 100 = 5
238 // Count at 53: 3 (< 5), count at 63: 10 (>= 5) -> quality 30
239 assert!((qc.get_percentile(33, 50) - 30.0).abs() < f64::EPSILON);
240
241 // 25th percentile: threshold = 10 * 25 / 100 = 2
242 // Count at 53: 3 (>= 2) -> quality 20
243 assert!((qc.get_percentile(33, 25) - 20.0).abs() < f64::EPSILON);
244
245 // 90th percentile: threshold = 10 * 90 / 100 = 9
246 // Count at 53: 3 (< 9), count at 63: 10 (>= 9) -> quality 30
247 assert!((qc.get_percentile(33, 90) - 30.0).abs() < f64::EPSILON);
248 }
249
250 #[test]
251 fn test_add_value_overflow_clamps() {
252 let mut qc = QualityCount::new();
253 // Values >= 150 are clamped to the last slot instead of panicking
254 qc.add_value(200);
255 assert_eq!(qc.get_total_count(), 1);
256 // The count should be in the last slot (index 149)
257 assert_eq!(qc.get_max_char(), Some(149));
258 }
259}