1use std::fs::File;
2use std::io::{Seek, SeekFrom};
3use std::path::Path;
4
5use anyhow::{Context, Result, anyhow};
6use noodles::core::Region;
7use noodles::fasta;
8use rand::Rng;
9
10use crate::sequence_dict::SequenceDictionary;
11
12pub struct Fasta {
18 reader: fasta::io::IndexedReader<fasta::io::BufReader<File>>,
20 dict: SequenceDictionary,
22}
23
24impl Fasta {
25 pub fn from_path(path: &Path) -> Result<Self> {
32 let reader = fasta::io::indexed_reader::Builder::default()
33 .build_from_path(path)
34 .with_context(|| format!("Failed to open indexed FASTA: {}", path.display()))?;
35
36 let dict = SequenceDictionary::from(reader.index());
37
38 Ok(Self { reader, dict })
39 }
40
41 #[must_use]
43 pub fn dict(&self) -> &SequenceDictionary {
44 &self.dict
45 }
46
47 pub fn load_contig(&mut self, contig_name: &str, rng: &mut impl Rng) -> Result<Vec<u8>> {
70 let meta = self
71 .dict
72 .get_by_name(contig_name)
73 .ok_or_else(|| anyhow!("Contig '{contig_name}' not found in reference"))?;
74 let expected_len = meta.length();
75
76 let region = Region::new(contig_name, ..);
77 let pos =
78 self.reader.index().query(®ion).with_context(|| {
79 format!("Failed to look up contig '{contig_name}' in FASTA index")
80 })?;
81 self.reader
82 .get_mut()
83 .seek(SeekFrom::Start(pos))
84 .with_context(|| format!("Failed to seek to contig '{contig_name}' in FASTA"))?;
85
86 let mut seq = Vec::with_capacity(expected_len);
87 self.reader
88 .read_sequence(&mut seq)
89 .with_context(|| format!("Failed to read contig '{contig_name}' from FASTA"))?;
90
91 normalize_bases(&mut seq, contig_name, rng)?;
92 Ok(seq)
93 }
94
95 #[must_use]
97 pub fn contig_names(&self) -> Vec<&str> {
98 self.dict.names()
99 }
100}
101
102fn resolve_iupac(code: u8, rng: &mut impl Rng) -> Option<u8> {
108 let alts: &[u8] = match code {
110 b'R' => b"AG",
111 b'Y' => b"CT",
112 b'S' => b"CG",
113 b'W' => b"AT",
114 b'K' => b"GT",
115 b'M' => b"AC",
116 b'B' => b"CGT",
117 b'D' => b"AGT",
118 b'H' => b"ACT",
119 b'V' => b"ACG",
120 b'N' => b"ACGT",
121 _ => return None,
122 };
123 Some(alts[rng.random_range(0..alts.len())])
124}
125
126fn normalize_bases(seq: &mut [u8], contig_name: &str, rng: &mut impl Rng) -> Result<()> {
132 for (i, b) in seq.iter_mut().enumerate() {
133 let upper = b.to_ascii_uppercase();
134 match upper {
135 b'A' | b'C' | b'G' | b'T' => *b = upper,
136 b'U' => *b = b'T',
137 b'R' | b'Y' | b'S' | b'W' | b'K' | b'M' | b'B' | b'D' | b'H' | b'V' | b'N' => {
138 let picked = resolve_iupac(upper, rng).expect("code is a valid IUPAC code");
140 *b = picked.to_ascii_lowercase();
141 }
142 _ => {
143 return Err(anyhow!(
144 "Contig '{contig_name}' contains unrecognized base {:?} (0x{:02X}) at position {i}",
145 char::from(*b),
146 *b
147 ));
148 }
149 }
150 }
151 Ok(())
152}
153
154#[cfg(test)]
162#[must_use]
163pub fn write_test_fasta(dir: &std::path::Path, contigs: &[(&str, &[u8])]) -> std::path::PathBuf {
164 use std::io::Write;
165
166 let fasta_path = dir.join("ref.fa");
167 let fai_path = dir.join("ref.fa.fai");
168
169 let mut fa = std::fs::File::create(&fasta_path).unwrap();
170 let mut fai = std::fs::File::create(&fai_path).unwrap();
171
172 let mut offset: u64 = 0;
173 for &(name, seq) in contigs {
174 let header = format!(">{name}\n");
176 fa.write_all(header.as_bytes()).unwrap();
177 offset += header.len() as u64;
178
179 let seq_offset = offset;
180 let line_len = 80;
182 for chunk in seq.chunks(line_len) {
183 fa.write_all(chunk).unwrap();
184 fa.write_all(b"\n").unwrap();
185 }
186
187 let seq_len = seq.len();
188 let line_bases = line_len;
189 let line_width = line_len + 1; writeln!(fai, "{name}\t{seq_len}\t{seq_offset}\t{line_bases}\t{line_width}").unwrap();
192
193 let full_lines = seq_len / line_len;
195 let remainder = seq_len % line_len;
196 offset += (full_lines * line_width) as u64;
197 if remainder > 0 {
198 offset += (remainder + 1) as u64;
199 }
200 }
201
202 fasta_path
203}
204
205#[cfg(test)]
206mod tests {
207 use super::*;
208 use rand::SeedableRng;
209 use rand::rngs::SmallRng;
210
211 fn test_rng() -> SmallRng {
213 SmallRng::seed_from_u64(42)
214 }
215
216 #[test]
217 fn test_load_contig_single() {
218 let dir = tempfile::tempdir().unwrap();
219 let path = write_test_fasta(dir.path(), &[("chr1", b"ACGTACGT")]);
220 let mut fasta = Fasta::from_path(&path).unwrap();
221
222 assert_eq!(fasta.contig_names(), vec!["chr1"]);
223 assert_eq!(fasta.dict().len(), 1);
224
225 let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
226 assert_eq!(&seq, b"ACGTACGT");
227 }
228
229 #[test]
230 fn test_load_contig_multiple() {
231 let dir = tempfile::tempdir().unwrap();
232 let path = write_test_fasta(
233 dir.path(),
234 &[("chr1", b"AAAA"), ("chr2", b"CCCCGGGG"), ("chr3", b"TT")],
235 );
236 let mut fasta = Fasta::from_path(&path).unwrap();
237
238 assert_eq!(fasta.contig_names(), vec!["chr1", "chr2", "chr3"]);
239
240 let seq1 = fasta.load_contig("chr1", &mut test_rng()).unwrap();
241 assert_eq!(&seq1, b"AAAA");
242
243 let seq2 = fasta.load_contig("chr2", &mut test_rng()).unwrap();
244 assert_eq!(&seq2, b"CCCCGGGG");
245
246 let seq3 = fasta.load_contig("chr3", &mut test_rng()).unwrap();
247 assert_eq!(&seq3, b"TT");
248 }
249
250 #[test]
251 fn test_load_contig_out_of_order() {
252 let dir = tempfile::tempdir().unwrap();
253 let path = write_test_fasta(dir.path(), &[("chr1", b"AAAA"), ("chr2", b"CCCC")]);
254 let mut fasta = Fasta::from_path(&path).unwrap();
255
256 let seq2 = fasta.load_contig("chr2", &mut test_rng()).unwrap();
258 assert_eq!(&seq2, b"CCCC");
259
260 let seq1 = fasta.load_contig("chr1", &mut test_rng()).unwrap();
261 assert_eq!(&seq1, b"AAAA");
262 }
263
264 #[test]
265 fn test_load_contig_uppercases_acgt() {
266 let dir = tempfile::tempdir().unwrap();
267 let path = write_test_fasta(dir.path(), &[("chr1", b"acgtACGT")]);
268 let mut fasta = Fasta::from_path(&path).unwrap();
269
270 let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
271 assert_eq!(&seq, b"ACGTACGT");
272 }
273
274 #[test]
275 fn test_load_contig_u_to_t() {
276 let dir = tempfile::tempdir().unwrap();
277 let path = write_test_fasta(dir.path(), &[("chr1", b"AUGCUu")]);
278 let mut fasta = Fasta::from_path(&path).unwrap();
279
280 let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
281 assert_eq!(&seq, b"ATGCTT");
282 }
283
284 #[test]
285 fn test_load_contig_unknown() {
286 let dir = tempfile::tempdir().unwrap();
287 let path = write_test_fasta(dir.path(), &[("chr1", b"ACGT")]);
288 let mut fasta = Fasta::from_path(&path).unwrap();
289
290 let result = fasta.load_contig("chrZ", &mut test_rng());
291 assert!(result.is_err());
292 assert!(result.unwrap_err().to_string().contains("not found"));
293 }
294
295 #[test]
296 fn test_load_contig_rejects_unknown_byte() {
297 let dir = tempfile::tempdir().unwrap();
298 let path = write_test_fasta(dir.path(), &[("chr1", b"ACZT")]);
299 let mut fasta = Fasta::from_path(&path).unwrap();
300
301 let err = fasta.load_contig("chr1", &mut test_rng()).unwrap_err();
302 let msg = err.to_string();
303 assert!(msg.contains("unrecognized base"), "message was: {msg}");
304 assert!(msg.contains("position 2"), "message was: {msg}");
305 }
306
307 #[test]
308 fn test_load_contig_iupac_resolved_to_lowercase() {
309 let dir = tempfile::tempdir().unwrap();
310 let path = write_test_fasta(dir.path(), &[("chr1", b"ARYSWKMBDHVN")]);
311 let mut fasta = Fasta::from_path(&path).unwrap();
312
313 let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
314 assert_eq!(seq.len(), 12);
315 assert_eq!(seq[0], b'A');
317 for (i, &b) in seq.iter().enumerate().skip(1) {
319 assert!(matches!(b, b'a' | b'c' | b'g' | b't'), "position {i} got {b:?}");
320 }
321 }
322
323 #[test]
324 fn test_load_contig_resolution_respects_ambiguity_set() {
325 let dir = tempfile::tempdir().unwrap();
326 let path = write_test_fasta(
328 dir.path(),
329 &[("chr1", &[b'R'; 200]), ("chr2", &[b'Y'; 200]), ("chr3", &[b'K'; 200])],
330 );
331 let mut fasta = Fasta::from_path(&path).unwrap();
332
333 let r = fasta.load_contig("chr1", &mut test_rng()).unwrap();
334 let y = fasta.load_contig("chr2", &mut test_rng()).unwrap();
335 let k = fasta.load_contig("chr3", &mut test_rng()).unwrap();
336
337 for &b in &r {
339 assert!(matches!(b, b'a' | b'g'), "R resolved to {b:?}");
340 }
341 for &b in &y {
343 assert!(matches!(b, b'c' | b't'), "Y resolved to {b:?}");
344 }
345 for &b in &k {
347 assert!(matches!(b, b'g' | b't'), "K resolved to {b:?}");
348 }
349 }
350
351 #[test]
352 fn test_load_contig_resolution_reproducible() {
353 let dir = tempfile::tempdir().unwrap();
354 let path = write_test_fasta(dir.path(), &[("chr1", b"NNNNNNNNNNRRYYSSWWKKMMBBDDHHVV")]);
355 let mut fasta_a = Fasta::from_path(&path).unwrap();
356 let mut fasta_b = Fasta::from_path(&path).unwrap();
357
358 let a = fasta_a.load_contig("chr1", &mut SmallRng::seed_from_u64(1234)).unwrap();
359 let b = fasta_b.load_contig("chr1", &mut SmallRng::seed_from_u64(1234)).unwrap();
360 assert_eq!(a, b);
361 }
362}