1use rsomics_common::{Result, RsomicsError};
2use std::io::Write;
3use std::path::Path;
4
5#[allow(clippy::cast_precision_loss)]
6pub fn seq_stats(input: &Path, output: &mut dyn Write) -> Result<()> {
7 let mut reader = needletail::parse_fastx_file(input)
8 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
9
10 let mut lengths: Vec<u64> = Vec::new();
11 let mut gc = 0u64;
12 let mut total_bp = 0u64;
13
14 while let Some(result) = reader.next() {
15 let record = result.map_err(|e| RsomicsError::InvalidInput(format!("read: {e}")))?;
16 let seq = record.seq();
17 let len = seq.len() as u64;
18 lengths.push(len);
19 total_bp += len;
20 gc += seq
21 .iter()
22 .filter(|b| matches!(b, b'G' | b'g' | b'C' | b'c'))
23 .count() as u64;
24 }
25
26 if lengths.is_empty() {
27 return Err(RsomicsError::InvalidInput("no sequences".into()));
28 }
29
30 lengths.sort_unstable();
31 let n = lengths.len();
32 let min = lengths[0];
33 let max = lengths[n - 1];
34 let mean = total_bp as f64 / n as f64;
35
36 let mut cumsum = 0u64;
37 let half = total_bp / 2;
38 let mut n50 = 0u64;
39 for &l in lengths.iter().rev() {
40 cumsum += l;
41 if cumsum >= half {
42 n50 = l;
43 break;
44 }
45 }
46
47 let gc_pct = if total_bp > 0 {
48 gc as f64 / total_bp as f64 * 100.0
49 } else {
50 0.0
51 };
52
53 writeln!(output, "sequences\t{n}").map_err(RsomicsError::Io)?;
54 writeln!(output, "total_bp\t{total_bp}").map_err(RsomicsError::Io)?;
55 writeln!(output, "min_len\t{min}").map_err(RsomicsError::Io)?;
56 writeln!(output, "max_len\t{max}").map_err(RsomicsError::Io)?;
57 writeln!(output, "mean_len\t{mean:.1}").map_err(RsomicsError::Io)?;
58 writeln!(output, "N50\t{n50}").map_err(RsomicsError::Io)?;
59 writeln!(output, "GC%\t{gc_pct:.2}").map_err(RsomicsError::Io)?;
60 Ok(())
61}