rsomics_rereplicate/
lib.rs1use std::io::{BufRead, Write};
20
21#[derive(Debug)]
23pub struct FastaRecord {
24 pub label: String,
26 pub abundance: Option<u64>,
28 pub seq: Vec<u8>,
30}
31
32pub fn parse_fasta(reader: &mut dyn BufRead) -> anyhow::Result<Vec<(String, Vec<u8>)>> {
37 let mut records: Vec<(String, Vec<u8>)> = Vec::new();
38 let mut current_label: Option<String> = None;
39 let mut current_seq: Vec<u8> = Vec::new();
40 let mut line = String::new();
41
42 loop {
43 line.clear();
44 let n = reader.read_line(&mut line)?;
45 if n == 0 {
46 break;
47 }
48 let trimmed = line.trim_end_matches(['\n', '\r']);
49 if trimmed.is_empty() {
50 continue;
51 }
52 if let Some(rest) = trimmed.strip_prefix('>') {
53 if let Some(label) = current_label.take() {
54 records.push((label, current_seq.clone()));
55 current_seq.clear();
56 }
57 current_label = Some(rest.to_owned());
58 } else if current_label.is_some() {
59 current_seq.extend_from_slice(trimmed.as_bytes());
60 }
61 }
62 if let Some(label) = current_label.take() {
63 records.push((label, current_seq));
64 }
65
66 Ok(records)
67}
68
69#[must_use]
73pub fn strip_size(label: &str) -> (String, Option<u64>) {
74 let parts: Vec<&str> = label.split(';').collect();
75 let mut size_value: Option<u64> = None;
76 let mut kept: Vec<&str> = Vec::with_capacity(parts.len());
77
78 for part in &parts {
79 if size_value.is_none()
80 && part.starts_with("size=")
81 && let Ok(n) = part["size=".len()..].parse::<u64>()
82 {
83 size_value = Some(n);
84 continue;
85 }
86 kept.push(part);
87 }
88
89 (kept.join(";"), size_value)
90}
91
92pub fn write_record(
95 out: &mut dyn Write,
96 label: &str,
97 sizeout: bool,
98 seq: &[u8],
99 width: usize,
100) -> std::io::Result<()> {
101 if sizeout {
102 writeln!(out, ">{label};size=1")?;
103 } else {
104 writeln!(out, ">{label}")?;
105 }
106 if width == 0 {
107 out.write_all(seq)?;
108 writeln!(out)?;
109 } else {
110 for chunk in seq.chunks(width) {
111 out.write_all(chunk)?;
112 writeln!(out)?;
113 }
114 }
115 Ok(())
116}
117
118pub fn rereplicate(
129 reader: &mut dyn BufRead,
130 writer: &mut dyn Write,
131 sizeout: bool,
132 fasta_width: usize,
133) -> anyhow::Result<(u64, u64, u64)> {
134 let records = parse_fasta(reader)?;
135 let mut amplicons: u64 = 0;
136 let mut reads: u64 = 0;
137 let mut missing: u64 = 0;
138
139 for (raw_label, seq) in records {
140 amplicons += 1;
141 let (label, size_opt) = strip_size(&raw_label);
142 let abundance = if let Some(n) = size_opt {
143 n
144 } else {
145 missing += 1;
146 1
147 };
148
149 for _ in 0..abundance {
150 reads += 1;
151 write_record(writer, &label, sizeout, &seq, fasta_width)?;
152 }
153 }
154
155 Ok((amplicons, reads, missing))
156}
157
158#[cfg(test)]
159mod tests {
160 use super::*;
161
162 #[test]
163 fn strip_size_trailing() {
164 let (s, v) = strip_size("seq1;size=5");
165 assert_eq!(s, "seq1");
166 assert_eq!(v, Some(5));
167 }
168
169 #[test]
170 fn strip_size_middle() {
171 let (s, v) = strip_size("seq1;k=v;size=3;extra=x");
172 assert_eq!(s, "seq1;k=v;extra=x");
173 assert_eq!(v, Some(3));
174 }
175
176 #[test]
177 fn strip_size_absent() {
178 let (s, v) = strip_size("seq1;k=v");
179 assert_eq!(s, "seq1;k=v");
180 assert_eq!(v, None);
181 }
182
183 #[test]
184 fn strip_size_non_numeric_kept() {
185 let (s, v) = strip_size("seq1;size=abc");
186 assert_eq!(s, "seq1;size=abc");
187 assert_eq!(v, None);
188 }
189
190 #[test]
191 fn rereplicate_basic() {
192 use std::io::BufReader;
193 let input = b">seq1;size=3\nACGT\n>seq2;size=1\nTTTT\n";
194 let mut reader = BufReader::new(input.as_ref());
195 let mut out: Vec<u8> = Vec::new();
196 let (amplicons, reads, missing) = rereplicate(&mut reader, &mut out, false, 80).unwrap();
197 assert_eq!(amplicons, 2);
198 assert_eq!(reads, 4);
199 assert_eq!(missing, 0);
200 let s = String::from_utf8(out).unwrap();
201 assert_eq!(s, ">seq1\nACGT\n>seq1\nACGT\n>seq1\nACGT\n>seq2\nTTTT\n");
202 }
203
204 #[test]
205 fn rereplicate_sizeout() {
206 use std::io::BufReader;
207 let input = b">seq1;size=2\nACGT\n";
208 let mut reader = BufReader::new(input.as_ref());
209 let mut out: Vec<u8> = Vec::new();
210 rereplicate(&mut reader, &mut out, true, 80).unwrap();
211 let s = String::from_utf8(out).unwrap();
212 assert_eq!(s, ">seq1;size=1\nACGT\n>seq1;size=1\nACGT\n");
213 }
214
215 #[test]
216 fn rereplicate_missing_abundance_treated_as_one() {
217 use std::io::BufReader;
218 let input = b">seq1\nACGT\n";
219 let mut reader = BufReader::new(input.as_ref());
220 let mut out: Vec<u8> = Vec::new();
221 let (amplicons, reads, missing) = rereplicate(&mut reader, &mut out, false, 80).unwrap();
222 assert_eq!(amplicons, 1);
223 assert_eq!(reads, 1);
224 assert_eq!(missing, 1);
225 let s = String::from_utf8(out).unwrap();
226 assert_eq!(s, ">seq1\nACGT\n");
227 }
228
229 #[test]
230 fn rereplicate_fasta_width() {
231 use std::io::BufReader;
232 let seq = "ACGTACGTAC"; let input = format!(">s;size=1\n{seq}\n");
234 let mut reader = BufReader::new(input.as_bytes());
235 let mut out: Vec<u8> = Vec::new();
236 rereplicate(&mut reader, &mut out, false, 4).unwrap();
237 let s = String::from_utf8(out).unwrap();
238 assert_eq!(s, ">s\nACGT\nACGT\nAC\n");
240 }
241}