Skip to main content

rsomics_rereplicate/
lib.rs

1//! Rereplicate: expand `;size=N` abundance annotations back into N individual
2//! FASTA records.  Inverse of `vsearch --derep_fulllength`.
3//!
4//! Behaviour matches `vsearch --rereplicate` v2.31.0 (BSD-2):
5//!
6//! - Each input record with `;size=N` is emitted N times in order.
7//! - The `;size=N` annotation is **stripped** from the output header (vsearch
8//!   sets `opt_xsize=true` unconditionally for this command).
9//! - With `--sizeout`, each copy receives `;size=1` appended instead.
10//! - Records with no `;size=` annotation are treated as abundance 1 (emitted
11//!   once), with a warning logged.
12//! - No `minseqlength` or `maxseqlength` filtering is applied (those options
13//!   are not in the `--rereplicate` allowed-option set).
14//! - Sequence bytes are preserved exactly (case + U kept); the tool does not
15//!   normalise for this operation.
16//! - FASTA output wraps at `fasta_width` columns (default 80).
17//! - Output order mirrors input order; each record's copies appear together.
18
19use std::io::{BufRead, Write};
20
21/// One parsed FASTA record.
22#[derive(Debug)]
23pub struct FastaRecord {
24    /// Label with the first `;size=N` token stripped.
25    pub label: String,
26    /// Abundance parsed from `;size=N`; `None` if annotation was absent.
27    pub abundance: Option<u64>,
28    /// Raw sequence bytes as they appeared in the file.
29    pub seq: Vec<u8>,
30}
31
32/// Parse FASTA records from a buffered reader.
33///
34/// Each record yields `(raw_label, raw_seq_bytes)`.  Multi-line sequences are
35/// concatenated.  Empty lines between records are skipped.
36pub fn parse_fasta(reader: &mut dyn BufRead) -> anyhow::Result<Vec<(String, Vec<u8>)>> {
37    let mut records: Vec<(String, Vec<u8>)> = Vec::new();
38    let mut current_label: Option<String> = None;
39    let mut current_seq: Vec<u8> = Vec::new();
40    let mut line = String::new();
41
42    loop {
43        line.clear();
44        let n = reader.read_line(&mut line)?;
45        if n == 0 {
46            break;
47        }
48        let trimmed = line.trim_end_matches(['\n', '\r']);
49        if trimmed.is_empty() {
50            continue;
51        }
52        if let Some(rest) = trimmed.strip_prefix('>') {
53            if let Some(label) = current_label.take() {
54                records.push((label, current_seq.clone()));
55                current_seq.clear();
56            }
57            current_label = Some(rest.to_owned());
58        } else if current_label.is_some() {
59            current_seq.extend_from_slice(trimmed.as_bytes());
60        }
61    }
62    if let Some(label) = current_label.take() {
63        records.push((label, current_seq));
64    }
65
66    Ok(records)
67}
68
69/// Strip the first `;size=N` token from a label.
70///
71/// Returns `(stripped_label, Some(N))` when found, `(original, None)` otherwise.
72#[must_use]
73pub fn strip_size(label: &str) -> (String, Option<u64>) {
74    let parts: Vec<&str> = label.split(';').collect();
75    let mut size_value: Option<u64> = None;
76    let mut kept: Vec<&str> = Vec::with_capacity(parts.len());
77
78    for part in &parts {
79        if size_value.is_none()
80            && part.starts_with("size=")
81            && let Ok(n) = part["size=".len()..].parse::<u64>()
82        {
83            size_value = Some(n);
84            continue;
85        }
86        kept.push(part);
87    }
88
89    (kept.join(";"), size_value)
90}
91
92/// Write one FASTA record to `out`, wrapping sequence at `width` columns.
93/// `width == 0` means no wrapping.
94pub fn write_record(
95    out: &mut dyn Write,
96    label: &str,
97    sizeout: bool,
98    seq: &[u8],
99    width: usize,
100) -> std::io::Result<()> {
101    if sizeout {
102        writeln!(out, ">{label};size=1")?;
103    } else {
104        writeln!(out, ">{label}")?;
105    }
106    if width == 0 {
107        out.write_all(seq)?;
108        writeln!(out)?;
109    } else {
110        for chunk in seq.chunks(width) {
111            out.write_all(chunk)?;
112            writeln!(out)?;
113        }
114    }
115    Ok(())
116}
117
118/// Core rereplicate logic.
119///
120/// Reads FASTA from `reader` and writes each record expanded `abundance` times
121/// to `writer`.  Returns `(amplicons, reads, missing_abundance_count)`.
122///
123/// `sizeout` appends `;size=1` to each emitted copy.
124/// `fasta_width` sets the sequence line wrap column (0 = no wrap).
125///
126/// Records without a `;size=N` annotation are treated as abundance 1 and
127/// counted in `missing_abundance_count`.
128pub fn rereplicate(
129    reader: &mut dyn BufRead,
130    writer: &mut dyn Write,
131    sizeout: bool,
132    fasta_width: usize,
133) -> anyhow::Result<(u64, u64, u64)> {
134    let records = parse_fasta(reader)?;
135    let mut amplicons: u64 = 0;
136    let mut reads: u64 = 0;
137    let mut missing: u64 = 0;
138
139    for (raw_label, seq) in records {
140        amplicons += 1;
141        let (label, size_opt) = strip_size(&raw_label);
142        let abundance = if let Some(n) = size_opt {
143            n
144        } else {
145            missing += 1;
146            1
147        };
148
149        for _ in 0..abundance {
150            reads += 1;
151            write_record(writer, &label, sizeout, &seq, fasta_width)?;
152        }
153    }
154
155    Ok((amplicons, reads, missing))
156}
157
158#[cfg(test)]
159mod tests {
160    use super::*;
161
162    #[test]
163    fn strip_size_trailing() {
164        let (s, v) = strip_size("seq1;size=5");
165        assert_eq!(s, "seq1");
166        assert_eq!(v, Some(5));
167    }
168
169    #[test]
170    fn strip_size_middle() {
171        let (s, v) = strip_size("seq1;k=v;size=3;extra=x");
172        assert_eq!(s, "seq1;k=v;extra=x");
173        assert_eq!(v, Some(3));
174    }
175
176    #[test]
177    fn strip_size_absent() {
178        let (s, v) = strip_size("seq1;k=v");
179        assert_eq!(s, "seq1;k=v");
180        assert_eq!(v, None);
181    }
182
183    #[test]
184    fn strip_size_non_numeric_kept() {
185        let (s, v) = strip_size("seq1;size=abc");
186        assert_eq!(s, "seq1;size=abc");
187        assert_eq!(v, None);
188    }
189
190    #[test]
191    fn rereplicate_basic() {
192        use std::io::BufReader;
193        let input = b">seq1;size=3\nACGT\n>seq2;size=1\nTTTT\n";
194        let mut reader = BufReader::new(input.as_ref());
195        let mut out: Vec<u8> = Vec::new();
196        let (amplicons, reads, missing) = rereplicate(&mut reader, &mut out, false, 80).unwrap();
197        assert_eq!(amplicons, 2);
198        assert_eq!(reads, 4);
199        assert_eq!(missing, 0);
200        let s = String::from_utf8(out).unwrap();
201        assert_eq!(s, ">seq1\nACGT\n>seq1\nACGT\n>seq1\nACGT\n>seq2\nTTTT\n");
202    }
203
204    #[test]
205    fn rereplicate_sizeout() {
206        use std::io::BufReader;
207        let input = b">seq1;size=2\nACGT\n";
208        let mut reader = BufReader::new(input.as_ref());
209        let mut out: Vec<u8> = Vec::new();
210        rereplicate(&mut reader, &mut out, true, 80).unwrap();
211        let s = String::from_utf8(out).unwrap();
212        assert_eq!(s, ">seq1;size=1\nACGT\n>seq1;size=1\nACGT\n");
213    }
214
215    #[test]
216    fn rereplicate_missing_abundance_treated_as_one() {
217        use std::io::BufReader;
218        let input = b">seq1\nACGT\n";
219        let mut reader = BufReader::new(input.as_ref());
220        let mut out: Vec<u8> = Vec::new();
221        let (amplicons, reads, missing) = rereplicate(&mut reader, &mut out, false, 80).unwrap();
222        assert_eq!(amplicons, 1);
223        assert_eq!(reads, 1);
224        assert_eq!(missing, 1);
225        let s = String::from_utf8(out).unwrap();
226        assert_eq!(s, ">seq1\nACGT\n");
227    }
228
229    #[test]
230    fn rereplicate_fasta_width() {
231        use std::io::BufReader;
232        let seq = "ACGTACGTAC"; // 10 bases
233        let input = format!(">s;size=1\n{seq}\n");
234        let mut reader = BufReader::new(input.as_bytes());
235        let mut out: Vec<u8> = Vec::new();
236        rereplicate(&mut reader, &mut out, false, 4).unwrap();
237        let s = String::from_utf8(out).unwrap();
238        // Should wrap at 4: ACGT\nACGT\nAC\n
239        assert_eq!(s, ">s\nACGT\nACGT\nAC\n");
240    }
241}