Skip to main content

fastqc_rust/modules/
per_tile_quality.rs

1// Per Tile Sequence Quality module
2// Corresponds to Modules/PerTileQualityScores.java
3
4use std::collections::HashMap;
5use std::io;
6
7use crate::config::{Limits, LimitsExt};
8use crate::modules::QCModule;
9use crate::report::charts::tile_graph::{render_tile_graph, TileGraphData};
10use crate::sequence::Sequence;
11use crate::utils::base_group::BaseGroup;
12use crate::utils::format::java_format_double;
13use crate::utils::quality_count::QualityCount;
14use crate::utils::{phred, quality_count};
15
16pub struct PerTileQualityScores {
17    per_tile_quality_counts: HashMap<i32, Vec<QualityCount>>,
18    current_length: usize,
19    total_count: u64,
20    // splitPosition tracks which colon-separated field contains the tile number.
21    // -1 means not yet determined.
22    split_position: i32,
23    ignore_in_report: bool,
24    nogroup: bool,
25    expgroup: bool,
26    min_length: usize,
27    limits: Limits,
28}
29
30impl PerTileQualityScores {
31    pub fn new(limits: &Limits, nogroup: bool, expgroup: bool, min_length: usize) -> Self {
32        PerTileQualityScores {
33            per_tile_quality_counts: HashMap::new(),
34            current_length: 0,
35            total_count: 0,
36            split_position: -1,
37            ignore_in_report: false,
38            nogroup,
39            expgroup,
40            min_length,
41            limits: limits.clone(),
42        }
43    }
44
45    fn calculate(&self) -> Option<TileCalculatedData> {
46        if self.per_tile_quality_counts.is_empty() {
47            return None;
48        }
49
50        // Collect all QualityCount slices across tiles to find global min/max chars
51        let all_counts: Vec<&QualityCount> = self
52            .per_tile_quality_counts
53            .values()
54            .flat_map(|v| v.iter())
55            .collect();
56        let (min_char, _max_char) = quality_count::calculate_offsets(all_counts);
57        // If no quality data, default to Sanger offset (33).
58        let offset = phred::detect(min_char).map(|e| e.offset).unwrap_or(33);
59
60        let groups = BaseGroup::make_base_groups(
61            self.current_length,
62            self.min_length,
63            self.nogroup,
64            self.expgroup,
65        );
66
67        let mut tile_numbers: Vec<i32> = self.per_tile_quality_counts.keys().copied().collect();
68        tile_numbers.sort();
69
70        let mut means = vec![vec![0.0f64; groups.len()]; tile_numbers.len()];
71        let mut x_labels = Vec::with_capacity(groups.len());
72
73        for (t, &tile) in tile_numbers.iter().enumerate() {
74            for (i, group) in groups.iter().enumerate() {
75                if t == 0 {
76                    x_labels.push(group.label());
77                }
78                let min_base = group.lower_count;
79                let max_base = group.upper_count;
80                means[t][i] = self.get_mean(tile, min_base, max_base, offset);
81            }
82        }
83
84        // Normalise by subtracting column averages to show deviations
85        let mut average_qualities_per_group = vec![0.0f64; groups.len()];
86        for tile_means in means.iter().take(tile_numbers.len()) {
87            for (avg, &m) in average_qualities_per_group
88                .iter_mut()
89                .zip(tile_means.iter())
90            {
91                *avg += m;
92            }
93        }
94        for avg in &mut average_qualities_per_group {
95            *avg /= tile_numbers.len() as f64;
96        }
97
98        let mut max_deviation: f64 = 0.0;
99        // subtract per-group averages from each tile's means
100        for (i, &avg) in average_qualities_per_group.iter().enumerate() {
101            for tile_means in means.iter_mut().take(tile_numbers.len()) {
102                tile_means[i] -= avg;
103                if tile_means[i].abs() > max_deviation {
104                    max_deviation = tile_means[i].abs();
105                }
106            }
107        }
108
109        Some(TileCalculatedData {
110            tiles: tile_numbers,
111            means,
112            x_labels,
113            max_deviation,
114        })
115    }
116
117    /// Replicates `getMean(int tile, int minbp, int maxbp, int offset)`.
118    fn get_mean(&self, tile: i32, min_base: usize, max_base: usize, offset: u8) -> f64 {
119        let quality_counts = match self.per_tile_quality_counts.get(&tile) {
120            Some(qc) => qc,
121            None => return 0.0,
122        };
123
124        let mut count = 0;
125        let mut total = 0.0;
126
127        for qc in &quality_counts[min_base..=max_base] {
128            if qc.get_total_count() > 0 {
129                count += 1;
130                total += qc.get_mean(offset);
131            }
132        }
133
134        if count > 0 {
135            total / count as f64
136        } else {
137            0.0
138        }
139    }
140}
141
142impl PerTileQualityScores {
143    fn build_chart_svg(&self) -> Option<String> {
144        let data = self.calculate()?;
145
146        // Color scale max is the error threshold from config
147        let color_scale_max = self.limits.threshold("tile\terror", 5.0);
148
149        Some(render_tile_graph(&TileGraphData {
150            x_labels: data.x_labels,
151            tiles: data.tiles,
152            tile_base_means: data.means,
153            color_scale_max,
154        }))
155    }
156}
157
158impl QCModule for PerTileQualityScores {
159    fn process_sequence(&mut self, sequence: &Sequence) {
160        // Check ignore config on first sequence
161        if self.total_count == 0 && self.limits.is_ignored("tile") {
162            self.ignore_in_report = true;
163        }
164
165        if self.ignore_in_report {
166            return;
167        }
168
169        // Skip zero-length quality strings
170        if sequence.quality.is_empty() {
171            return;
172        }
173
174        self.total_count += 1;
175
176        // Sample all for first 10k reads, then every 10th
177        if self.total_count > 10000 && !self.total_count.is_multiple_of(10) {
178            return;
179        }
180
181        // Parse tile ID from read header.
182        // Use nth() on the split iterator to avoid allocating a Vec per sequence.
183        if self.split_position < 0 {
184            let field_count = sequence.id.split(':').count();
185            if field_count >= 7 {
186                // 1.8+ format, tile at position 4
187                self.split_position = 4;
188            } else if field_count >= 5 {
189                // Older format, tile at position 2
190                self.split_position = 2;
191            } else {
192                // Can't get a tile from this header
193                self.ignore_in_report = true;
194                return;
195            }
196        }
197
198        let tile = match sequence
199            .id
200            .split(':')
201            .nth(self.split_position as usize)
202            .and_then(|f| f.parse::<i32>().ok())
203        {
204            Some(t) => t,
205            None => {
206                self.ignore_in_report = true;
207                return;
208            }
209        };
210
211        let qual = &sequence.quality;
212
213        // Grow all existing tile arrays if quality string is longer
214        if self.current_length < qual.len() {
215            for qc_vec in self.per_tile_quality_counts.values_mut() {
216                qc_vec.resize_with(qual.len(), QualityCount::new);
217            }
218            self.current_length = qual.len();
219        }
220
221        // Add new tile if not seen, with check for too many tiles
222        if !self.per_tile_quality_counts.contains_key(&tile) {
223            if self.per_tile_quality_counts.len() > 2500 {
224                // Too many tiles, give up
225                eprintln!("Too many tiles (>2500) so giving up trying to do per-tile qualities since we're probably parsing the file wrongly");
226                self.ignore_in_report = true;
227                self.per_tile_quality_counts.clear();
228                return;
229            }
230
231            let mut quality_counts = Vec::new();
232            quality_counts.resize_with(self.current_length, QualityCount::new);
233            self.per_tile_quality_counts.insert(tile, quality_counts);
234        }
235
236        let quality_counts = self.per_tile_quality_counts.get_mut(&tile).unwrap();
237
238        for (i, &q) in qual.iter().enumerate() {
239            quality_counts[i].add_value(q);
240        }
241    }
242
243    fn name(&self) -> &str {
244        "Per tile sequence quality"
245    }
246
247    fn description(&self) -> &str {
248        "Shows the per tile Quality scores of all bases at a given position in a sequencing run"
249    }
250
251    fn reset(&mut self) {
252        self.total_count = 0;
253        self.per_tile_quality_counts.clear();
254        self.current_length = 0;
255        self.split_position = -1;
256        self.ignore_in_report = false;
257    }
258
259    fn raises_error(&self) -> bool {
260        let threshold = self.limits.threshold("tile\terror", 5.0);
261        self.calculate()
262            .is_some_and(|data| data.max_deviation > threshold)
263    }
264
265    fn raises_warning(&self) -> bool {
266        let threshold = self.limits.threshold("tile\twarn", 2.0);
267        self.calculate()
268            .is_some_and(|data| data.max_deviation > threshold)
269    }
270
271    fn ignore_filtered_sequences(&self) -> bool {
272        true
273    }
274
275    fn ignore_in_report(&self) -> bool {
276        // Ignore if flagged, configured to ignore, or no data
277        self.ignore_in_report || self.limits.is_ignored("tile") || self.current_length == 0
278    }
279
280    fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
281        let data = match self.calculate() {
282            Some(d) => d,
283            None => return Ok(()),
284        };
285
286        // Header and format match Java's makeReport
287        writeln!(writer, "#Tile\tBase\tMean")?;
288
289        for (t, &tile) in data.tiles.iter().enumerate() {
290            for i in 0..data.means[t].len() {
291                writeln!(
292                    writer,
293                    "{}\t{}\t{}",
294                    tile,
295                    data.x_labels[i],
296                    java_format_double(data.means[t][i]),
297                )?;
298            }
299        }
300
301        Ok(())
302    }
303
304    // Image filename matches Java's "per_tile_quality.png" in Images/
305    fn chart_image_name(&self) -> Option<&str> {
306        Some("per_tile_quality")
307    }
308    fn chart_alt_text(&self) -> Option<&str> {
309        Some("Per tile sequence quality")
310    }
311    fn generate_chart_svg(&self) -> Option<String> {
312        self.build_chart_svg()
313    }
314}
315
316struct TileCalculatedData {
317    tiles: Vec<i32>,
318    means: Vec<Vec<f64>>,
319    x_labels: Vec<String>,
320    max_deviation: f64,
321}