Skip to main content

fastqc_rust/modules/
sequence_length_distribution.rs

1// Sequence Length Distribution module
2// Corresponds to Modules/SequenceLengthDistribution.java
3
4use std::io;
5
6use crate::config::{Limits, LimitsExt};
7use crate::modules::QCModule;
8use crate::report::charts::line_graph::{render_line_graph, LineGraphData};
9use crate::sequence::Sequence;
10use crate::utils::format::java_format_double;
11
12pub struct SequenceLengthDistribution {
13    /// length_counts[i] = number of sequences with length i
14    length_counts: Vec<u64>,
15    limits: Limits,
16    nogroup: bool,
17    // Lazily computed results
18    computed: Option<ComputedDistribution>,
19}
20
21struct ComputedDistribution {
22    graph_counts: Vec<f64>,
23    x_categories: Vec<String>,
24}
25
26impl SequenceLengthDistribution {
27    pub fn new(limits: &Limits, nogroup: bool) -> Self {
28        SequenceLengthDistribution {
29            length_counts: Vec::new(),
30            limits: limits.clone(),
31            nogroup,
32            computed: None,
33        }
34    }
35
36    /// Find interval and starting point for binning sequence lengths.
37    /// Replicates getSizeDistribution() from SequenceLengthDistribution.java.
38    fn get_size_distribution(min: usize, max: usize, nogroup: bool) -> (usize, usize) {
39        // If nogroup is set, don't bin
40        if nogroup {
41            return (min, 1);
42        }
43
44        // Find the smallest interval from [1,2,5]*base that gives <=50 bins
45        let mut base: usize = 1;
46
47        // The Java code starts with base=1 and has `while (base > (max-min)) base /= 10`
48        // which is a no-op when base starts at 1 (since 1 is always <= max-min for any valid range).
49        // We skip this loop as it has no effect in practice.
50
51        let divisions = [1, 2, 5];
52        let interval;
53
54        'outer: loop {
55            for &d in &divisions {
56                let tester = base * d;
57                if (max - min) / tester <= 50 {
58                    interval = tester;
59                    break 'outer;
60                }
61            }
62            base *= 10;
63        }
64
65        // Calculate starting value aligned to interval boundary
66        let basic_division = min / interval;
67        let starting = basic_division * interval;
68
69        (starting, interval)
70    }
71
72    fn calculate_distribution(&mut self) {
73        if self.computed.is_some() {
74            return;
75        }
76
77        let mut max_len: usize = 0;
78        let mut min_len: Option<usize> = None;
79
80        // Find min and max lengths
81        for i in 0..self.length_counts.len() {
82            if self.length_counts[i] > 0 {
83                if min_len.is_none() {
84                    min_len = Some(i);
85                }
86                max_len = i;
87            }
88        }
89
90        // Default min to 0 if no sequences
91        let mut min_len = min_len.unwrap_or(0);
92
93        // Add one extra category on either side
94        min_len = min_len.saturating_sub(1);
95        max_len += 1;
96
97        let (starting, interval) = Self::get_size_distribution(min_len, max_len, self.nogroup);
98
99        // Count how many categories we need
100        let mut categories = 0;
101        let mut current_value = starting;
102        while current_value <= max_len {
103            categories += 1;
104            current_value += interval;
105        }
106
107        let mut graph_counts = vec![0.0f64; categories];
108        let mut x_categories = Vec::with_capacity(categories);
109
110        // i needed to compute bin boundaries
111        for (i, gc) in graph_counts.iter_mut().enumerate() {
112            let min_value = starting + (interval * i);
113            let mut max_value = (starting + (interval * (i + 1))) - 1;
114
115            // Clamp max_value to maxLen
116            if max_value > max_len {
117                max_value = max_len;
118            }
119
120            // Sum counts in this bin
121            for bp in min_value..=max_value {
122                if bp < self.length_counts.len() {
123                    *gc += self.length_counts[bp] as f64;
124                }
125            }
126
127            // Label format depends on interval
128            if interval == 1 {
129                x_categories.push(format!("{}", min_value));
130            } else {
131                x_categories.push(format!("{}-{}", min_value, max_value));
132            }
133        }
134
135        self.computed = Some(ComputedDistribution {
136            graph_counts,
137            x_categories,
138        });
139    }
140
141    fn ensure_calculated(&self) -> &ComputedDistribution {
142        // SAFETY: finalize() must be called before any reporting method.
143        // If a caller skips finalize(), we provide a static default to avoid panicking.
144        static DEFAULT: ComputedDistribution = ComputedDistribution {
145            graph_counts: Vec::new(),
146            x_categories: Vec::new(),
147        };
148        self.computed.as_ref().unwrap_or(&DEFAULT)
149    }
150}
151
152impl SequenceLengthDistribution {
153    fn build_chart_svg(&self) -> String {
154        let computed = self.ensure_calculated();
155        let max_val = computed
156            .graph_counts
157            .iter()
158            .cloned()
159            .fold(0.0_f64, f64::max);
160        // Java passes raw max to LineGraph (no ceil rounding)
161        let max_y = max_val;
162
163        // Matches Java constructor call
164        render_line_graph(&LineGraphData {
165            data: vec![computed.graph_counts.clone()],
166            min_y: 0.0,
167            max_y,
168            x_label: "Sequence Length (bp)".to_string(),
169            series_names: vec!["Sequence Length".to_string()],
170            x_categories: computed.x_categories.clone(),
171            title: "Distribution of sequence lengths over all sequences".to_string(),
172        })
173    }
174}
175
176impl QCModule for SequenceLengthDistribution {
177    fn process_sequence(&mut self, sequence: &Sequence) {
178        self.computed = None;
179        let seq_len = sequence.sequence.len();
180
181        // Array is extended to seqLen+2 to match Java's `seqLen+2 > lengthCounts.length`
182        if seq_len + 2 > self.length_counts.len() {
183            self.length_counts.resize(seq_len + 2, 0);
184        }
185
186        self.length_counts[seq_len] += 1;
187    }
188
189    fn name(&self) -> &str {
190        "Sequence Length Distribution"
191    }
192
193    fn description(&self) -> &str {
194        "Shows the distribution of sequence length over all sequences"
195    }
196
197    fn reset(&mut self) {
198        self.length_counts.clear();
199        self.computed = None;
200    }
201
202    fn finalize(&mut self) {
203        self.calculate_distribution();
204    }
205
206    fn raises_error(&self) -> bool {
207        // If error threshold is 0, the test is disabled
208        let threshold = self.limits.threshold("sequence_length\terror", 1.0);
209        if threshold == 0.0 {
210            return false;
211        }
212
213        // Error if there are sequences of length 0
214        if !self.length_counts.is_empty() && self.length_counts[0] > 0 {
215            return true;
216        }
217        false
218    }
219
220    fn raises_warning(&self) -> bool {
221        // If warn threshold is 0, the test is disabled
222        let threshold = self.limits.threshold("sequence_length\twarn", 1.0);
223        if threshold == 0.0 {
224            return false;
225        }
226
227        // Warn if there are sequences of different lengths
228        let mut seen_length = false;
229        for &count in &self.length_counts {
230            if count > 0 {
231                if seen_length {
232                    return true;
233                }
234                seen_length = true;
235            }
236        }
237        false
238    }
239
240    fn ignore_filtered_sequences(&self) -> bool {
241        true
242    }
243
244    fn ignore_in_report(&self) -> bool {
245        self.limits.is_ignored("sequence_length")
246    }
247
248    fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
249        let computed = self.ensure_calculated();
250
251        writeln!(writer, "#Length\tCount")?;
252        for i in 0..computed.x_categories.len() {
253            // Skip empty padding bins at the start and end
254            if (i == 0 || i == computed.x_categories.len() - 1) && computed.graph_counts[i] == 0.0 {
255                continue;
256            }
257            writeln!(
258                writer,
259                "{}\t{}",
260                computed.x_categories[i],
261                java_format_double(computed.graph_counts[i])
262            )?;
263        }
264        Ok(())
265    }
266
267    // Image filename matches Java's "sequence_length_distribution.png" in Images/
268    fn chart_image_name(&self) -> Option<&str> {
269        Some("sequence_length_distribution")
270    }
271    fn chart_alt_text(&self) -> Option<&str> {
272        Some("Sequence length distribution")
273    }
274    fn generate_chart_svg(&self) -> Option<String> {
275        Some(self.build_chart_svg())
276    }
277}