Skip to main content

holodeck_lib/
fasta.rs

1use std::fs::File;
2use std::io::{Seek, SeekFrom};
3use std::path::Path;
4
5use anyhow::{Context, Result, anyhow};
6use noodles::core::Region;
7use noodles::fasta;
8
9use crate::sequence_dict::SequenceDictionary;
10
11/// A loaded FASTA reference with random-access sequence retrieval.
12///
13/// Wraps a noodles `IndexedReader` for efficient seek-based access.  Contig
14/// metadata (names, lengths) is pre-computed from the `.fai` index at
15/// construction time.
16pub struct Fasta {
17    /// Noodles indexed FASTA reader.
18    reader: fasta::io::IndexedReader<fasta::io::BufReader<File>>,
19    /// Sequence dictionary built from the `.fai` index.
20    dict: SequenceDictionary,
21}
22
23impl Fasta {
24    /// Open a FASTA file and its `.fai` index.
25    ///
26    /// The `.fai` file must exist alongside the FASTA (e.g. `ref.fa.fai`).
27    ///
28    /// # Errors
29    /// Returns an error if the FASTA or its index cannot be read.
30    pub fn from_path(path: &Path) -> Result<Self> {
31        let reader = fasta::io::indexed_reader::Builder::default()
32            .build_from_path(path)
33            .with_context(|| format!("Failed to open indexed FASTA: {}", path.display()))?;
34
35        let dict = SequenceDictionary::from(reader.index());
36
37        Ok(Self { reader, dict })
38    }
39
40    /// Return a reference to the underlying sequence dictionary.
41    #[must_use]
42    pub fn dict(&self) -> &SequenceDictionary {
43        &self.dict
44    }
45
46    /// Load and return the full sequence of `contig_name` as uppercase bytes.
47    ///
48    /// Uses seek-based access via the FAI index for efficiency, avoiding the
49    /// intermediate `Record` allocation that `query()` performs.
50    ///
51    /// # Errors
52    /// Returns an error if the contig is unknown or the FASTA cannot be read.
53    pub fn load_contig(&mut self, contig_name: &str) -> Result<Vec<u8>> {
54        let meta = self
55            .dict
56            .get_by_name(contig_name)
57            .ok_or_else(|| anyhow!("Contig '{contig_name}' not found in reference"))?;
58        let expected_len = meta.length();
59
60        let region = Region::new(contig_name, ..);
61        let pos =
62            self.reader.index().query(&region).with_context(|| {
63                format!("Failed to look up contig '{contig_name}' in FASTA index")
64            })?;
65        self.reader
66            .get_mut()
67            .seek(SeekFrom::Start(pos))
68            .with_context(|| format!("Failed to seek to contig '{contig_name}' in FASTA"))?;
69
70        let mut seq = Vec::with_capacity(expected_len);
71        self.reader
72            .read_sequence(&mut seq)
73            .with_context(|| format!("Failed to read contig '{contig_name}' from FASTA"))?;
74
75        // Uppercase all bases for consistent matching.
76        for b in &mut seq {
77            *b = b.to_ascii_uppercase();
78        }
79        Ok(seq)
80    }
81
82    /// Return contig names in index order.
83    #[must_use]
84    pub fn contig_names(&self) -> Vec<&str> {
85        self.dict.names()
86    }
87}
88
89/// Write a FASTA file and its `.fai` index from programmatic data.
90///
91/// Each entry in `contigs` is `(name, sequence)`. Returns the path to the
92/// FASTA file.
93///
94/// # Panics
95/// Panics on any I/O error. This is a test-only helper.
96#[cfg(test)]
97#[must_use]
98pub fn write_test_fasta(dir: &std::path::Path, contigs: &[(&str, &[u8])]) -> std::path::PathBuf {
99    use std::io::Write;
100
101    let fasta_path = dir.join("ref.fa");
102    let fai_path = dir.join("ref.fa.fai");
103
104    let mut fa = std::fs::File::create(&fasta_path).unwrap();
105    let mut fai = std::fs::File::create(&fai_path).unwrap();
106
107    let mut offset: u64 = 0;
108    for &(name, seq) in contigs {
109        // Write FASTA: >name\nsequence\n
110        let header = format!(">{name}\n");
111        fa.write_all(header.as_bytes()).unwrap();
112        offset += header.len() as u64;
113
114        let seq_offset = offset;
115        // Write sequence in 80-char lines.
116        let line_len = 80;
117        for chunk in seq.chunks(line_len) {
118            fa.write_all(chunk).unwrap();
119            fa.write_all(b"\n").unwrap();
120        }
121
122        let seq_len = seq.len();
123        let line_bases = line_len;
124        let line_width = line_len + 1; // bases + newline
125        // FAI format: name\tlength\toffset\tline_bases\tline_width
126        writeln!(fai, "{name}\t{seq_len}\t{seq_offset}\t{line_bases}\t{line_width}").unwrap();
127
128        // Update offset: seq bytes + newlines
129        let full_lines = seq_len / line_len;
130        let remainder = seq_len % line_len;
131        offset += (full_lines * line_width) as u64;
132        if remainder > 0 {
133            offset += (remainder + 1) as u64;
134        }
135    }
136
137    fasta_path
138}
139
140#[cfg(test)]
141mod tests {
142    use super::*;
143
144    #[test]
145    fn test_load_contig_single() {
146        let dir = tempfile::tempdir().unwrap();
147        let path = write_test_fasta(dir.path(), &[("chr1", b"ACGTACGT")]);
148        let mut fasta = Fasta::from_path(&path).unwrap();
149
150        assert_eq!(fasta.contig_names(), vec!["chr1"]);
151        assert_eq!(fasta.dict().len(), 1);
152
153        let seq = fasta.load_contig("chr1").unwrap();
154        assert_eq!(&seq, b"ACGTACGT");
155    }
156
157    #[test]
158    fn test_load_contig_multiple() {
159        let dir = tempfile::tempdir().unwrap();
160        let path = write_test_fasta(
161            dir.path(),
162            &[("chr1", b"AAAA"), ("chr2", b"CCCCGGGG"), ("chr3", b"TT")],
163        );
164        let mut fasta = Fasta::from_path(&path).unwrap();
165
166        assert_eq!(fasta.contig_names(), vec!["chr1", "chr2", "chr3"]);
167
168        let seq1 = fasta.load_contig("chr1").unwrap();
169        assert_eq!(&seq1, b"AAAA");
170
171        let seq2 = fasta.load_contig("chr2").unwrap();
172        assert_eq!(&seq2, b"CCCCGGGG");
173
174        let seq3 = fasta.load_contig("chr3").unwrap();
175        assert_eq!(&seq3, b"TT");
176    }
177
178    #[test]
179    fn test_load_contig_out_of_order() {
180        let dir = tempfile::tempdir().unwrap();
181        let path = write_test_fasta(dir.path(), &[("chr1", b"AAAA"), ("chr2", b"CCCC")]);
182        let mut fasta = Fasta::from_path(&path).unwrap();
183
184        // Load chr2 first, then chr1 — verify seeking works correctly.
185        let seq2 = fasta.load_contig("chr2").unwrap();
186        assert_eq!(&seq2, b"CCCC");
187
188        let seq1 = fasta.load_contig("chr1").unwrap();
189        assert_eq!(&seq1, b"AAAA");
190    }
191
192    #[test]
193    fn test_load_contig_uppercase() {
194        let dir = tempfile::tempdir().unwrap();
195        let path = write_test_fasta(dir.path(), &[("chr1", b"acgtNn")]);
196        let mut fasta = Fasta::from_path(&path).unwrap();
197
198        let seq = fasta.load_contig("chr1").unwrap();
199        assert_eq!(&seq, b"ACGTNN");
200    }
201
202    #[test]
203    fn test_load_contig_unknown() {
204        let dir = tempfile::tempdir().unwrap();
205        let path = write_test_fasta(dir.path(), &[("chr1", b"ACGT")]);
206        let mut fasta = Fasta::from_path(&path).unwrap();
207
208        let result = fasta.load_contig("chrZ");
209        assert!(result.is_err());
210        assert!(result.unwrap_err().to_string().contains("not found"));
211    }
212}