fastqc_rust/modules/
basic_stats.rs1use std::io;
5
6use crate::config::Limits;
7use crate::modules::QCModule;
8use crate::sequence::Sequence;
9use crate::utils::base_counts::{BASE_INDEX, IDX_A, IDX_C, IDX_G, IDX_N, IDX_T};
10use crate::utils::phred;
11
12pub struct BasicStats {
13 name: Option<String>,
14 actual_count: u64,
15 filtered_count: u64,
16 min_length: usize,
17 max_length: usize,
18 total_bases: u64,
19 g_count: u64,
20 c_count: u64,
21 a_count: u64,
22 t_count: u64,
23 n_count: u64,
24 lowest_char: u8,
27 file_type: Option<String>,
28}
29
30impl BasicStats {
31 pub fn new(_limits: &Limits) -> Self {
32 BasicStats {
33 name: None,
34 actual_count: 0,
35 filtered_count: 0,
36 min_length: 0,
37 max_length: 0,
38 total_bases: 0,
39 g_count: 0,
40 c_count: 0,
41 a_count: 0,
42 t_count: 0,
43 n_count: 0,
44 lowest_char: 126,
46 file_type: None,
47 }
48 }
49
50 pub fn set_file_name(&mut self, name: &str) {
54 let name = name.strip_prefix("stdin:").unwrap_or(name);
55 self.name = Some(name.to_string());
56 }
57
58 pub fn format_length(original_length: u64) -> String {
63 let mut length = original_length as f64;
64 let unit;
65
66 if length >= 1_000_000_000.0 {
67 length /= 1_000_000_000.0;
68 unit = " Gbp";
69 } else if length >= 1_000_000.0 {
70 length /= 1_000_000.0;
71 unit = " Mbp";
72 } else if length >= 1_000.0 {
73 length /= 1_000.0;
74 unit = " kbp";
75 } else {
76 unit = " bp";
77 }
78
79 let raw = format!("{}", length);
83 let chars: Vec<char> = raw.chars().collect();
84
85 let mut last_index = 0;
86
87 for (i, &ch) in chars.iter().enumerate() {
89 last_index = i;
90 if ch == '.' {
91 break;
92 }
93 }
94
95 if last_index + 1 < chars.len() && chars[last_index + 1] != '0' {
97 last_index += 1;
98 } else if last_index > 0 && chars[last_index] == '.' {
99 last_index -= 1;
101 }
102
103 let truncated: String = chars[..=last_index].iter().collect();
104 format!("{}{}", truncated, unit)
105 }
106}
107
108impl QCModule for BasicStats {
109 fn process_sequence(&mut self, sequence: &Sequence) {
110 if sequence.is_filtered {
112 self.filtered_count += 1;
113 return;
114 }
115
116 self.actual_count += 1;
117 self.total_bases += sequence.sequence.len() as u64;
118
119 if self.file_type.is_none() {
120 self.file_type = if sequence.colorspace.is_some() {
121 Some("Colorspace converted to bases".to_string())
122 } else {
123 Some("Conventional base calls".to_string())
124 };
125 }
126
127 let len = sequence.sequence.len();
129 if self.actual_count == 1 {
130 self.min_length = len;
131 self.max_length = len;
132 } else {
133 self.min_length = self.min_length.min(len);
134 self.max_length = self.max_length.max(len);
135 }
136
137 let mut counts = [0u64; 6];
139 for &b in &sequence.sequence {
140 counts[BASE_INDEX[b as usize] as usize] += 1;
141 }
142 self.a_count += counts[IDX_A];
143 self.c_count += counts[IDX_C];
144 self.g_count += counts[IDX_G];
145 self.t_count += counts[IDX_T];
146 self.n_count += counts[IDX_N];
147
148 for &q in &sequence.quality {
149 if q < self.lowest_char {
150 self.lowest_char = q;
151 }
152 }
153 }
154
155 fn set_filename(&mut self, name: &str) {
156 self.set_file_name(name);
157 }
158
159 fn name(&self) -> &str {
160 "Basic Statistics"
161 }
162
163 fn description(&self) -> &str {
164 "Calculates some basic statistics about the file"
165 }
166
167 fn reset(&mut self) {
168 self.min_length = 0;
169 self.max_length = 0;
170 self.g_count = 0;
171 self.c_count = 0;
172 self.a_count = 0;
173 self.t_count = 0;
174 self.n_count = 0;
175 }
176
177 fn raises_error(&self) -> bool {
179 false
180 }
181
182 fn raises_warning(&self) -> bool {
183 false
184 }
185
186 fn ignore_filtered_sequences(&self) -> bool {
187 false
189 }
190
191 fn ignore_in_report(&self) -> bool {
192 false
193 }
194
195 fn write_text_report(&self, writer: &mut dyn io::Write) -> io::Result<()> {
196 writeln!(writer, "#Measure\tValue")?;
198
199 writeln!(writer, "Filename\t{}", self.name.as_deref().unwrap_or(""))?;
201
202 writeln!(
204 writer,
205 "File type\t{}",
206 self.file_type
207 .as_deref()
208 .unwrap_or("Conventional base calls")
209 )?;
210
211 let encoding_name = phred::detect(self.lowest_char)
214 .map(|e| e.name.to_string())
215 .unwrap_or_else(|_| "Unknown".to_string());
216 writeln!(writer, "Encoding\t{}", encoding_name)?;
217
218 writeln!(writer, "Total Sequences\t{}", self.actual_count)?;
220
221 writeln!(
223 writer,
224 "Total Bases\t{}",
225 Self::format_length(self.total_bases)
226 )?;
227
228 writeln!(
230 writer,
231 "Sequences flagged as poor quality\t{}",
232 self.filtered_count
233 )?;
234
235 if self.min_length == self.max_length {
237 writeln!(writer, "Sequence length\t{}", self.min_length)?;
238 } else {
239 writeln!(
240 writer,
241 "Sequence length\t{}-{}",
242 self.min_length, self.max_length
243 )?;
244 }
245
246 let total = self.a_count + self.t_count + self.g_count + self.c_count;
249 let gc = ((self.g_count + self.c_count) * 100)
250 .checked_div(total)
251 .unwrap_or(0);
252 writeln!(writer, "%GC\t{}", gc)?;
253
254 Ok(())
255 }
256}
257
258#[cfg(test)]
259mod tests {
260 use super::*;
261
262 #[test]
263 fn test_format_length_bp() {
264 assert_eq!(BasicStats::format_length(16), "16 bp");
265 assert_eq!(BasicStats::format_length(80), "80 bp");
266 assert_eq!(BasicStats::format_length(999), "999 bp");
267 }
268
269 #[test]
270 fn test_format_length_kbp() {
271 assert_eq!(BasicStats::format_length(1000), "1 kbp");
272 assert_eq!(BasicStats::format_length(1500), "1.5 kbp");
273 assert_eq!(BasicStats::format_length(10000), "10 kbp");
274 }
275
276 #[test]
277 fn test_format_length_mbp() {
278 assert_eq!(BasicStats::format_length(1_000_000), "1 Mbp");
279 assert_eq!(BasicStats::format_length(1_200_000), "1.2 Mbp");
280 }
281
282 #[test]
283 fn test_format_length_gbp() {
284 assert_eq!(BasicStats::format_length(1_000_000_000), "1 Gbp");
285 }
286}