Skip to main content

fastqc_rust/modules/
per_base_sequence_content.rs

1// Per Base Sequence Content module
2// Corresponds to Modules/PerBaseSequenceContent.java
3
4use std::io;
5
6use crate::config::{Limits, LimitsExt};
7use crate::modules::QCModule;
8use crate::report::charts::line_graph::{render_line_graph, LineGraphData};
9use crate::sequence::Sequence;
10use crate::utils::base_counts::{BASE_INDEX, IDX_A, IDX_C, IDX_G, IDX_T};
11use crate::utils::base_group::BaseGroup;
12use crate::utils::format::java_format_double;
13
14pub struct PerBaseSequenceContent {
15    /// Per-position base counts stored as [A, C, G, T] per position.
16    /// Using a single Vec of arrays gives better cache locality than
17    /// four separate Vecs, since all counts for a position are adjacent.
18    counts: Vec<[u64; 4]>,
19    nogroup: bool,
20    expgroup: bool,
21    min_length: usize,
22    limits: Limits,
23}
24
25impl PerBaseSequenceContent {
26    pub fn new(limits: &Limits, nogroup: bool, expgroup: bool, min_length: usize) -> Self {
27        PerBaseSequenceContent {
28            counts: Vec::new(),
29            nogroup,
30            expgroup,
31            min_length,
32            limits: limits.clone(),
33        }
34    }
35
36    fn calculate(&self) -> ContentData {
37        let groups = BaseGroup::make_base_groups(
38            self.counts.len(),
39            self.min_length,
40            self.nogroup,
41            self.expgroup,
42        );
43
44        let mut x_categories = Vec::with_capacity(groups.len());
45        let mut g_percent = vec![0.0f64; groups.len()];
46        let mut a_percent = vec![0.0f64; groups.len()];
47        let mut t_percent = vec![0.0f64; groups.len()];
48        let mut c_percent = vec![0.0f64; groups.len()];
49
50        for (i, group) in groups.iter().enumerate() {
51            x_categories.push(group.label());
52
53            let mut a_count: u64 = 0;
54            let mut c_count: u64 = 0;
55            let mut g_count: u64 = 0;
56            let mut t_count: u64 = 0;
57            let mut total: u64 = 0;
58
59            // Java iterates `for (int bp=groups[i].lowerCount()-1;bp<groups[i].upperCount();bp++)`
60            // which is 0-based lowerCount-1 to upperCount-1 inclusive. Our lower_count/upper_count
61            // are already 0-based.
62            for bp in group.lower_count..=group.upper_count {
63                let c = &self.counts[bp];
64                a_count += c[IDX_A];
65                c_count += c[IDX_C];
66                g_count += c[IDX_G];
67                t_count += c[IDX_T];
68                total += c[IDX_A] + c[IDX_C] + c[IDX_G] + c[IDX_T];
69            }
70
71            g_percent[i] = (g_count as f64 / total as f64) * 100.0;
72            a_percent[i] = (a_count as f64 / total as f64) * 100.0;
73            t_percent[i] = (t_count as f64 / total as f64) * 100.0;
74            c_percent[i] = (c_count as f64 / total as f64) * 100.0;
75        }
76
77        // percentages stored in order [T, C, A, G] matching Java's array layout
78        ContentData {
79            x_categories,
80            t_percent,
81            c_percent,
82            a_percent,
83            g_percent,
84        }
85    }
86}
87
88impl PerBaseSequenceContent {
89    fn build_chart_svg(&self) -> String {
90        let data = self.calculate();
91
92        // Series order in LineGraph is [%T, %C, %A, %G], matching Java's
93        // `new LineGraph(percentages, 0d, 100d, ..., new String[] {"%T","%C","%A","%G"}, ...)`
94        render_line_graph(&LineGraphData {
95            data: vec![
96                data.t_percent,
97                data.c_percent,
98                data.a_percent,
99                data.g_percent,
100            ],
101            min_y: 0.0,
102            max_y: 100.0,
103            x_label: "Position in read (bp)".to_string(),
104            series_names: vec![
105                "%T".to_string(),
106                "%C".to_string(),
107                "%A".to_string(),
108                "%G".to_string(),
109            ],
110            x_categories: data.x_categories,
111            title: "Sequence content across all bases".to_string(),
112        })
113    }
114}
115
116impl QCModule for PerBaseSequenceContent {
117    fn process_sequence(&mut self, sequence: &Sequence) {
118        let seq = &sequence.sequence;
119
120        // Grow array if needed
121        if self.counts.len() < seq.len() {
122            self.counts.resize(seq.len(), [0; 4]);
123        }
124
125        // Use lookup table for branchless base classification.
126        // Only A/C/G/T (indices 0-3) are counted; N and others (indices 4-5)
127        // are filtered by the single comparison `idx < 4`.
128        for (i, &b) in seq.iter().enumerate() {
129            let idx = BASE_INDEX[b as usize] as usize;
130            if idx < 4 {
131                self.counts[i][idx] += 1;
132            }
133        }
134    }
135
136    fn name(&self) -> &str {
137        "Per base sequence content"
138    }
139
140    fn description(&self) -> &str {
141        "Shows the relative amounts of each base at each position in a sequencing run"
142    }
143
144    fn reset(&mut self) {
145        self.counts.clear();
146    }
147
148    fn raises_error(&self) -> bool {
149        let error_threshold = self.limits.threshold("sequence\terror", 20.0);
150        let data = self.calculate();
151
152        // Check GC diff (C vs G) and AT diff (T vs A) per group
153        for i in 0..data.g_percent.len() {
154            let gc_diff = (data.c_percent[i] - data.g_percent[i]).abs();
155            let at_diff = (data.t_percent[i] - data.a_percent[i]).abs();
156
157            if gc_diff > error_threshold || at_diff > error_threshold {
158                return true;
159            }
160        }
161        false
162    }
163
164    fn raises_warning(&self) -> bool {
165        let warn_threshold = self.limits.threshold("sequence\twarn", 10.0);
166        let data = self.calculate();
167
168        for i in 0..data.g_percent.len() {
169            let gc_diff = (data.c_percent[i] - data.g_percent[i]).abs();
170            let at_diff = (data.t_percent[i] - data.a_percent[i]).abs();
171
172            if gc_diff > warn_threshold || at_diff > warn_threshold {
173                return true;
174            }
175        }
176        false
177    }
178
179    fn ignore_filtered_sequences(&self) -> bool {
180        true
181    }
182
183    fn ignore_in_report(&self) -> bool {
184        self.limits.is_ignored("sequence")
185    }
186
187    fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
188        let data = self.calculate();
189
190        // Column order is G, A, T, C matching Java's makeReport
191        writeln!(writer, "#Base\tG\tA\tT\tC")?;
192
193        for i in 0..data.x_categories.len() {
194            // Java outputs percentages[3][i] (G), [2][i] (A), [0][i] (T), [1][i] (C)
195            writeln!(
196                writer,
197                "{}\t{}\t{}\t{}\t{}",
198                data.x_categories[i],
199                java_format_double(data.g_percent[i]),
200                java_format_double(data.a_percent[i]),
201                java_format_double(data.t_percent[i]),
202                java_format_double(data.c_percent[i]),
203            )?;
204        }
205
206        Ok(())
207    }
208
209    // Image filename matches Java's "per_base_sequence_content.png" in Images/
210    fn chart_image_name(&self) -> Option<&str> {
211        Some("per_base_sequence_content")
212    }
213    fn chart_alt_text(&self) -> Option<&str> {
214        Some("Per base sequence content")
215    }
216    fn generate_chart_svg(&self) -> Option<String> {
217        Some(self.build_chart_svg())
218    }
219}
220
221struct ContentData {
222    x_categories: Vec<String>,
223    t_percent: Vec<f64>,
224    c_percent: Vec<f64>,
225    a_percent: Vec<f64>,
226    g_percent: Vec<f64>,
227}