use std::io::{BufRead, Write};
#[derive(Debug)]
pub struct FastaRecord {
pub label: String,
pub abundance: Option<u64>,
pub seq: Vec<u8>,
}
pub fn parse_fasta(reader: &mut dyn BufRead) -> anyhow::Result<Vec<(String, Vec<u8>)>> {
let mut records: Vec<(String, Vec<u8>)> = Vec::new();
let mut current_label: Option<String> = None;
let mut current_seq: Vec<u8> = Vec::new();
let mut line = String::new();
loop {
line.clear();
let n = reader.read_line(&mut line)?;
if n == 0 {
break;
}
let trimmed = line.trim_end_matches(['\n', '\r']);
if trimmed.is_empty() {
continue;
}
if let Some(rest) = trimmed.strip_prefix('>') {
if let Some(label) = current_label.take() {
records.push((label, current_seq.clone()));
current_seq.clear();
}
current_label = Some(rest.to_owned());
} else if current_label.is_some() {
current_seq.extend_from_slice(trimmed.as_bytes());
}
}
if let Some(label) = current_label.take() {
records.push((label, current_seq));
}
Ok(records)
}
#[must_use]
pub fn strip_size(label: &str) -> (String, Option<u64>) {
let parts: Vec<&str> = label.split(';').collect();
let mut size_value: Option<u64> = None;
let mut kept: Vec<&str> = Vec::with_capacity(parts.len());
for part in &parts {
if size_value.is_none()
&& part.starts_with("size=")
&& let Ok(n) = part["size=".len()..].parse::<u64>()
{
size_value = Some(n);
continue;
}
kept.push(part);
}
(kept.join(";"), size_value)
}
pub fn write_record(
out: &mut dyn Write,
label: &str,
sizeout: bool,
seq: &[u8],
width: usize,
) -> std::io::Result<()> {
if sizeout {
writeln!(out, ">{label};size=1")?;
} else {
writeln!(out, ">{label}")?;
}
if width == 0 {
out.write_all(seq)?;
writeln!(out)?;
} else {
for chunk in seq.chunks(width) {
out.write_all(chunk)?;
writeln!(out)?;
}
}
Ok(())
}
pub fn rereplicate(
reader: &mut dyn BufRead,
writer: &mut dyn Write,
sizeout: bool,
fasta_width: usize,
) -> anyhow::Result<(u64, u64, u64)> {
let records = parse_fasta(reader)?;
let mut amplicons: u64 = 0;
let mut reads: u64 = 0;
let mut missing: u64 = 0;
for (raw_label, seq) in records {
amplicons += 1;
let (label, size_opt) = strip_size(&raw_label);
let abundance = if let Some(n) = size_opt {
n
} else {
missing += 1;
1
};
for _ in 0..abundance {
reads += 1;
write_record(writer, &label, sizeout, &seq, fasta_width)?;
}
}
Ok((amplicons, reads, missing))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn strip_size_trailing() {
let (s, v) = strip_size("seq1;size=5");
assert_eq!(s, "seq1");
assert_eq!(v, Some(5));
}
#[test]
fn strip_size_middle() {
let (s, v) = strip_size("seq1;k=v;size=3;extra=x");
assert_eq!(s, "seq1;k=v;extra=x");
assert_eq!(v, Some(3));
}
#[test]
fn strip_size_absent() {
let (s, v) = strip_size("seq1;k=v");
assert_eq!(s, "seq1;k=v");
assert_eq!(v, None);
}
#[test]
fn strip_size_non_numeric_kept() {
let (s, v) = strip_size("seq1;size=abc");
assert_eq!(s, "seq1;size=abc");
assert_eq!(v, None);
}
#[test]
fn rereplicate_basic() {
use std::io::BufReader;
let input = b">seq1;size=3\nACGT\n>seq2;size=1\nTTTT\n";
let mut reader = BufReader::new(input.as_ref());
let mut out: Vec<u8> = Vec::new();
let (amplicons, reads, missing) = rereplicate(&mut reader, &mut out, false, 80).unwrap();
assert_eq!(amplicons, 2);
assert_eq!(reads, 4);
assert_eq!(missing, 0);
let s = String::from_utf8(out).unwrap();
assert_eq!(s, ">seq1\nACGT\n>seq1\nACGT\n>seq1\nACGT\n>seq2\nTTTT\n");
}
#[test]
fn rereplicate_sizeout() {
use std::io::BufReader;
let input = b">seq1;size=2\nACGT\n";
let mut reader = BufReader::new(input.as_ref());
let mut out: Vec<u8> = Vec::new();
rereplicate(&mut reader, &mut out, true, 80).unwrap();
let s = String::from_utf8(out).unwrap();
assert_eq!(s, ">seq1;size=1\nACGT\n>seq1;size=1\nACGT\n");
}
#[test]
fn rereplicate_missing_abundance_treated_as_one() {
use std::io::BufReader;
let input = b">seq1\nACGT\n";
let mut reader = BufReader::new(input.as_ref());
let mut out: Vec<u8> = Vec::new();
let (amplicons, reads, missing) = rereplicate(&mut reader, &mut out, false, 80).unwrap();
assert_eq!(amplicons, 1);
assert_eq!(reads, 1);
assert_eq!(missing, 1);
let s = String::from_utf8(out).unwrap();
assert_eq!(s, ">seq1\nACGT\n");
}
#[test]
fn rereplicate_fasta_width() {
use std::io::BufReader;
let seq = "ACGTACGTAC"; let input = format!(">s;size=1\n{seq}\n");
let mut reader = BufReader::new(input.as_bytes());
let mut out: Vec<u8> = Vec::new();
rereplicate(&mut reader, &mut out, false, 4).unwrap();
let s = String::from_utf8(out).unwrap();
assert_eq!(s, ">s\nACGT\nACGT\nAC\n");
}
}