Skip to main content

holodeck_lib/
fasta.rs

1use std::fs::File;
2use std::io::{Seek, SeekFrom};
3use std::path::Path;
4
5use anyhow::{Context, Result, anyhow};
6use noodles::core::Region;
7use noodles::fasta;
8use rand::Rng;
9
10use crate::sequence_dict::SequenceDictionary;
11
12/// A loaded FASTA reference with random-access sequence retrieval.
13///
14/// Wraps a noodles `IndexedReader` for efficient seek-based access.  Contig
15/// metadata (names, lengths) is pre-computed from the `.fai` index at
16/// construction time.
17pub struct Fasta {
18    /// Noodles indexed FASTA reader.
19    reader: fasta::io::IndexedReader<fasta::io::BufReader<File>>,
20    /// Sequence dictionary built from the `.fai` index.
21    dict: SequenceDictionary,
22}
23
24impl Fasta {
25    /// Open a FASTA file and its `.fai` index.
26    ///
27    /// The `.fai` file must exist alongside the FASTA (e.g. `ref.fa.fai`).
28    ///
29    /// # Errors
30    /// Returns an error if the FASTA or its index cannot be read.
31    pub fn from_path(path: &Path) -> Result<Self> {
32        let reader = fasta::io::indexed_reader::Builder::default()
33            .build_from_path(path)
34            .with_context(|| format!("Failed to open indexed FASTA: {}", path.display()))?;
35
36        let dict = SequenceDictionary::from(reader.index());
37
38        Ok(Self { reader, dict })
39    }
40
41    /// Return a reference to the underlying sequence dictionary.
42    #[must_use]
43    pub fn dict(&self) -> &SequenceDictionary {
44        &self.dict
45    }
46
47    /// Load and return the full sequence of `contig_name`.
48    ///
49    /// Uses seek-based access via the FAI index for efficiency, avoiding the
50    /// intermediate `Record` allocation that `query()` performs.
51    ///
52    /// Bases are normalized during loading:
53    /// - `A`, `C`, `G`, `T` (case-insensitive) are stored as uppercase.
54    /// - `U` (RNA) is converted to `T`.
55    /// - Any other IUPAC ambiguity code (`R`, `Y`, `S`, `W`, `K`, `M`, `B`,
56    ///   `D`, `H`, `V`, `N`) is resolved to a uniformly-random base drawn
57    ///   from that code's ambiguity set and stored in **lowercase**. The
58    ///   lowercase marker allows downstream sampling to detect and optionally
59    ///   reject reads that contain too many synthesized bases (see
60    ///   [`crate::fragment::lowercase_fraction`]).
61    /// - Any other byte is rejected with an error.
62    ///
63    /// Passing the same `rng` state produces a deterministic resolution, so
64    /// two runs seeded identically produce identical reference buffers.
65    ///
66    /// # Errors
67    /// Returns an error if the contig is unknown, the FASTA cannot be read,
68    /// or the contig contains an unrecognized base.
69    pub fn load_contig(&mut self, contig_name: &str, rng: &mut impl Rng) -> Result<Vec<u8>> {
70        let meta = self
71            .dict
72            .get_by_name(contig_name)
73            .ok_or_else(|| anyhow!("Contig '{contig_name}' not found in reference"))?;
74        let expected_len = meta.length();
75
76        let region = Region::new(contig_name, ..);
77        let pos =
78            self.reader.index().query(&region).with_context(|| {
79                format!("Failed to look up contig '{contig_name}' in FASTA index")
80            })?;
81        self.reader
82            .get_mut()
83            .seek(SeekFrom::Start(pos))
84            .with_context(|| format!("Failed to seek to contig '{contig_name}' in FASTA"))?;
85
86        let mut seq = Vec::with_capacity(expected_len);
87        self.reader
88            .read_sequence(&mut seq)
89            .with_context(|| format!("Failed to read contig '{contig_name}' from FASTA"))?;
90
91        normalize_bases(&mut seq, contig_name, rng)?;
92        Ok(seq)
93    }
94
95    /// Return contig names in index order.
96    #[must_use]
97    pub fn contig_names(&self) -> Vec<&str> {
98        self.dict.names()
99    }
100}
101
102/// Resolve a non-ACGT IUPAC code to one uniformly-chosen valid base.
103///
104/// The input byte must already be uppercase. Returns `None` if the input
105/// is not a recognized IUPAC code (including plain `A`/`C`/`G`/`T`, which
106/// the caller should handle before calling this function).
107fn resolve_iupac(code: u8, rng: &mut impl Rng) -> Option<u8> {
108    // Ambiguity sets per the IUPAC nucleotide code table.
109    let alts: &[u8] = match code {
110        b'R' => b"AG",
111        b'Y' => b"CT",
112        b'S' => b"CG",
113        b'W' => b"AT",
114        b'K' => b"GT",
115        b'M' => b"AC",
116        b'B' => b"CGT",
117        b'D' => b"AGT",
118        b'H' => b"ACT",
119        b'V' => b"ACG",
120        b'N' => b"ACGT",
121        _ => return None,
122    };
123    Some(alts[rng.random_range(0..alts.len())])
124}
125
126/// Normalize a contig's bases in place per the rules documented on
127/// [`Fasta::load_contig`].
128///
129/// Invalid bytes produce an error identifying the offending byte and its
130/// 0-based position within the contig.
131fn normalize_bases(seq: &mut [u8], contig_name: &str, rng: &mut impl Rng) -> Result<()> {
132    for (i, b) in seq.iter_mut().enumerate() {
133        let upper = b.to_ascii_uppercase();
134        match upper {
135            b'A' | b'C' | b'G' | b'T' => *b = upper,
136            b'U' => *b = b'T',
137            b'R' | b'Y' | b'S' | b'W' | b'K' | b'M' | b'B' | b'D' | b'H' | b'V' | b'N' => {
138                // Resolve to lowercase (marker for "synthesized from ambiguity").
139                let picked = resolve_iupac(upper, rng).expect("code is a valid IUPAC code");
140                *b = picked.to_ascii_lowercase();
141            }
142            _ => {
143                return Err(anyhow!(
144                    "Contig '{contig_name}' contains unrecognized base {:?} (0x{:02X}) at position {i}",
145                    char::from(*b),
146                    *b
147                ));
148            }
149        }
150    }
151    Ok(())
152}
153
154/// Write a FASTA file and its `.fai` index from programmatic data.
155///
156/// Each entry in `contigs` is `(name, sequence)`. Returns the path to the
157/// FASTA file.
158///
159/// # Panics
160/// Panics on any I/O error. This is a test-only helper.
161#[cfg(test)]
162#[must_use]
163pub fn write_test_fasta(dir: &std::path::Path, contigs: &[(&str, &[u8])]) -> std::path::PathBuf {
164    use std::io::Write;
165
166    let fasta_path = dir.join("ref.fa");
167    let fai_path = dir.join("ref.fa.fai");
168
169    let mut fa = std::fs::File::create(&fasta_path).unwrap();
170    let mut fai = std::fs::File::create(&fai_path).unwrap();
171
172    let mut offset: u64 = 0;
173    for &(name, seq) in contigs {
174        // Write FASTA: >name\nsequence\n
175        let header = format!(">{name}\n");
176        fa.write_all(header.as_bytes()).unwrap();
177        offset += header.len() as u64;
178
179        let seq_offset = offset;
180        // Write sequence in 80-char lines.
181        let line_len = 80;
182        for chunk in seq.chunks(line_len) {
183            fa.write_all(chunk).unwrap();
184            fa.write_all(b"\n").unwrap();
185        }
186
187        let seq_len = seq.len();
188        let line_bases = line_len;
189        let line_width = line_len + 1; // bases + newline
190        // FAI format: name\tlength\toffset\tline_bases\tline_width
191        writeln!(fai, "{name}\t{seq_len}\t{seq_offset}\t{line_bases}\t{line_width}").unwrap();
192
193        // Update offset: seq bytes + newlines
194        let full_lines = seq_len / line_len;
195        let remainder = seq_len % line_len;
196        offset += (full_lines * line_width) as u64;
197        if remainder > 0 {
198            offset += (remainder + 1) as u64;
199        }
200    }
201
202    fasta_path
203}
204
205#[cfg(test)]
206mod tests {
207    use super::*;
208    use rand::SeedableRng;
209    use rand::rngs::SmallRng;
210
211    /// Build a fresh deterministic RNG for tests.
212    fn test_rng() -> SmallRng {
213        SmallRng::seed_from_u64(42)
214    }
215
216    #[test]
217    fn test_load_contig_single() {
218        let dir = tempfile::tempdir().unwrap();
219        let path = write_test_fasta(dir.path(), &[("chr1", b"ACGTACGT")]);
220        let mut fasta = Fasta::from_path(&path).unwrap();
221
222        assert_eq!(fasta.contig_names(), vec!["chr1"]);
223        assert_eq!(fasta.dict().len(), 1);
224
225        let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
226        assert_eq!(&seq, b"ACGTACGT");
227    }
228
229    #[test]
230    fn test_load_contig_multiple() {
231        let dir = tempfile::tempdir().unwrap();
232        let path = write_test_fasta(
233            dir.path(),
234            &[("chr1", b"AAAA"), ("chr2", b"CCCCGGGG"), ("chr3", b"TT")],
235        );
236        let mut fasta = Fasta::from_path(&path).unwrap();
237
238        assert_eq!(fasta.contig_names(), vec!["chr1", "chr2", "chr3"]);
239
240        let seq1 = fasta.load_contig("chr1", &mut test_rng()).unwrap();
241        assert_eq!(&seq1, b"AAAA");
242
243        let seq2 = fasta.load_contig("chr2", &mut test_rng()).unwrap();
244        assert_eq!(&seq2, b"CCCCGGGG");
245
246        let seq3 = fasta.load_contig("chr3", &mut test_rng()).unwrap();
247        assert_eq!(&seq3, b"TT");
248    }
249
250    #[test]
251    fn test_load_contig_out_of_order() {
252        let dir = tempfile::tempdir().unwrap();
253        let path = write_test_fasta(dir.path(), &[("chr1", b"AAAA"), ("chr2", b"CCCC")]);
254        let mut fasta = Fasta::from_path(&path).unwrap();
255
256        // Load chr2 first, then chr1 — verify seeking works correctly.
257        let seq2 = fasta.load_contig("chr2", &mut test_rng()).unwrap();
258        assert_eq!(&seq2, b"CCCC");
259
260        let seq1 = fasta.load_contig("chr1", &mut test_rng()).unwrap();
261        assert_eq!(&seq1, b"AAAA");
262    }
263
264    #[test]
265    fn test_load_contig_uppercases_acgt() {
266        let dir = tempfile::tempdir().unwrap();
267        let path = write_test_fasta(dir.path(), &[("chr1", b"acgtACGT")]);
268        let mut fasta = Fasta::from_path(&path).unwrap();
269
270        let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
271        assert_eq!(&seq, b"ACGTACGT");
272    }
273
274    #[test]
275    fn test_load_contig_u_to_t() {
276        let dir = tempfile::tempdir().unwrap();
277        let path = write_test_fasta(dir.path(), &[("chr1", b"AUGCUu")]);
278        let mut fasta = Fasta::from_path(&path).unwrap();
279
280        let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
281        assert_eq!(&seq, b"ATGCTT");
282    }
283
284    #[test]
285    fn test_load_contig_unknown() {
286        let dir = tempfile::tempdir().unwrap();
287        let path = write_test_fasta(dir.path(), &[("chr1", b"ACGT")]);
288        let mut fasta = Fasta::from_path(&path).unwrap();
289
290        let result = fasta.load_contig("chrZ", &mut test_rng());
291        assert!(result.is_err());
292        assert!(result.unwrap_err().to_string().contains("not found"));
293    }
294
295    #[test]
296    fn test_load_contig_rejects_unknown_byte() {
297        let dir = tempfile::tempdir().unwrap();
298        let path = write_test_fasta(dir.path(), &[("chr1", b"ACZT")]);
299        let mut fasta = Fasta::from_path(&path).unwrap();
300
301        let err = fasta.load_contig("chr1", &mut test_rng()).unwrap_err();
302        let msg = err.to_string();
303        assert!(msg.contains("unrecognized base"), "message was: {msg}");
304        assert!(msg.contains("position 2"), "message was: {msg}");
305    }
306
307    #[test]
308    fn test_load_contig_iupac_resolved_to_lowercase() {
309        let dir = tempfile::tempdir().unwrap();
310        let path = write_test_fasta(dir.path(), &[("chr1", b"ARYSWKMBDHVN")]);
311        let mut fasta = Fasta::from_path(&path).unwrap();
312
313        let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
314        assert_eq!(seq.len(), 12);
315        // First base is a real A.
316        assert_eq!(seq[0], b'A');
317        // All ambiguity-resolved bases should be lowercase ACGT.
318        for (i, &b) in seq.iter().enumerate().skip(1) {
319            assert!(matches!(b, b'a' | b'c' | b'g' | b't'), "position {i} got {b:?}");
320        }
321    }
322
323    #[test]
324    fn test_load_contig_resolution_respects_ambiguity_set() {
325        let dir = tempfile::tempdir().unwrap();
326        // Long runs so we observe multiple draws per code.
327        let path = write_test_fasta(
328            dir.path(),
329            &[("chr1", &[b'R'; 200]), ("chr2", &[b'Y'; 200]), ("chr3", &[b'K'; 200])],
330        );
331        let mut fasta = Fasta::from_path(&path).unwrap();
332
333        let r = fasta.load_contig("chr1", &mut test_rng()).unwrap();
334        let y = fasta.load_contig("chr2", &mut test_rng()).unwrap();
335        let k = fasta.load_contig("chr3", &mut test_rng()).unwrap();
336
337        // R ∈ {A,G} (lowercase)
338        for &b in &r {
339            assert!(matches!(b, b'a' | b'g'), "R resolved to {b:?}");
340        }
341        // Y ∈ {C,T}
342        for &b in &y {
343            assert!(matches!(b, b'c' | b't'), "Y resolved to {b:?}");
344        }
345        // K ∈ {G,T}
346        for &b in &k {
347            assert!(matches!(b, b'g' | b't'), "K resolved to {b:?}");
348        }
349    }
350
351    #[test]
352    fn test_load_contig_resolution_reproducible() {
353        let dir = tempfile::tempdir().unwrap();
354        let path = write_test_fasta(dir.path(), &[("chr1", b"NNNNNNNNNNRRYYSSWWKKMMBBDDHHVV")]);
355        let mut fasta_a = Fasta::from_path(&path).unwrap();
356        let mut fasta_b = Fasta::from_path(&path).unwrap();
357
358        let a = fasta_a.load_contig("chr1", &mut SmallRng::seed_from_u64(1234)).unwrap();
359        let b = fasta_b.load_contig("chr1", &mut SmallRng::seed_from_u64(1234)).unwrap();
360        assert_eq!(a, b);
361    }
362}