1use std::fs::File;
2use std::io::{Seek, SeekFrom};
3use std::path::Path;
4
5use anyhow::{Context, Result, anyhow};
6use noodles::core::Region;
7use noodles::fasta;
8
9use crate::sequence_dict::SequenceDictionary;
10
11pub struct Fasta {
17 reader: fasta::io::IndexedReader<fasta::io::BufReader<File>>,
19 dict: SequenceDictionary,
21}
22
23impl Fasta {
24 pub fn from_path(path: &Path) -> Result<Self> {
31 let reader = fasta::io::indexed_reader::Builder::default()
32 .build_from_path(path)
33 .with_context(|| format!("Failed to open indexed FASTA: {}", path.display()))?;
34
35 let dict = SequenceDictionary::from(reader.index());
36
37 Ok(Self { reader, dict })
38 }
39
40 #[must_use]
42 pub fn dict(&self) -> &SequenceDictionary {
43 &self.dict
44 }
45
46 pub fn load_contig(&mut self, contig_name: &str) -> Result<Vec<u8>> {
54 let meta = self
55 .dict
56 .get_by_name(contig_name)
57 .ok_or_else(|| anyhow!("Contig '{contig_name}' not found in reference"))?;
58 let expected_len = meta.length();
59
60 let region = Region::new(contig_name, ..);
61 let pos =
62 self.reader.index().query(®ion).with_context(|| {
63 format!("Failed to look up contig '{contig_name}' in FASTA index")
64 })?;
65 self.reader
66 .get_mut()
67 .seek(SeekFrom::Start(pos))
68 .with_context(|| format!("Failed to seek to contig '{contig_name}' in FASTA"))?;
69
70 let mut seq = Vec::with_capacity(expected_len);
71 self.reader
72 .read_sequence(&mut seq)
73 .with_context(|| format!("Failed to read contig '{contig_name}' from FASTA"))?;
74
75 for b in &mut seq {
77 *b = b.to_ascii_uppercase();
78 }
79 Ok(seq)
80 }
81
82 #[must_use]
84 pub fn contig_names(&self) -> Vec<&str> {
85 self.dict.names()
86 }
87}
88
89#[cfg(test)]
97#[must_use]
98pub fn write_test_fasta(dir: &std::path::Path, contigs: &[(&str, &[u8])]) -> std::path::PathBuf {
99 use std::io::Write;
100
101 let fasta_path = dir.join("ref.fa");
102 let fai_path = dir.join("ref.fa.fai");
103
104 let mut fa = std::fs::File::create(&fasta_path).unwrap();
105 let mut fai = std::fs::File::create(&fai_path).unwrap();
106
107 let mut offset: u64 = 0;
108 for &(name, seq) in contigs {
109 let header = format!(">{name}\n");
111 fa.write_all(header.as_bytes()).unwrap();
112 offset += header.len() as u64;
113
114 let seq_offset = offset;
115 let line_len = 80;
117 for chunk in seq.chunks(line_len) {
118 fa.write_all(chunk).unwrap();
119 fa.write_all(b"\n").unwrap();
120 }
121
122 let seq_len = seq.len();
123 let line_bases = line_len;
124 let line_width = line_len + 1; writeln!(fai, "{name}\t{seq_len}\t{seq_offset}\t{line_bases}\t{line_width}").unwrap();
127
128 let full_lines = seq_len / line_len;
130 let remainder = seq_len % line_len;
131 offset += (full_lines * line_width) as u64;
132 if remainder > 0 {
133 offset += (remainder + 1) as u64;
134 }
135 }
136
137 fasta_path
138}
139
140#[cfg(test)]
141mod tests {
142 use super::*;
143
144 #[test]
145 fn test_load_contig_single() {
146 let dir = tempfile::tempdir().unwrap();
147 let path = write_test_fasta(dir.path(), &[("chr1", b"ACGTACGT")]);
148 let mut fasta = Fasta::from_path(&path).unwrap();
149
150 assert_eq!(fasta.contig_names(), vec!["chr1"]);
151 assert_eq!(fasta.dict().len(), 1);
152
153 let seq = fasta.load_contig("chr1").unwrap();
154 assert_eq!(&seq, b"ACGTACGT");
155 }
156
157 #[test]
158 fn test_load_contig_multiple() {
159 let dir = tempfile::tempdir().unwrap();
160 let path = write_test_fasta(
161 dir.path(),
162 &[("chr1", b"AAAA"), ("chr2", b"CCCCGGGG"), ("chr3", b"TT")],
163 );
164 let mut fasta = Fasta::from_path(&path).unwrap();
165
166 assert_eq!(fasta.contig_names(), vec!["chr1", "chr2", "chr3"]);
167
168 let seq1 = fasta.load_contig("chr1").unwrap();
169 assert_eq!(&seq1, b"AAAA");
170
171 let seq2 = fasta.load_contig("chr2").unwrap();
172 assert_eq!(&seq2, b"CCCCGGGG");
173
174 let seq3 = fasta.load_contig("chr3").unwrap();
175 assert_eq!(&seq3, b"TT");
176 }
177
178 #[test]
179 fn test_load_contig_out_of_order() {
180 let dir = tempfile::tempdir().unwrap();
181 let path = write_test_fasta(dir.path(), &[("chr1", b"AAAA"), ("chr2", b"CCCC")]);
182 let mut fasta = Fasta::from_path(&path).unwrap();
183
184 let seq2 = fasta.load_contig("chr2").unwrap();
186 assert_eq!(&seq2, b"CCCC");
187
188 let seq1 = fasta.load_contig("chr1").unwrap();
189 assert_eq!(&seq1, b"AAAA");
190 }
191
192 #[test]
193 fn test_load_contig_uppercase() {
194 let dir = tempfile::tempdir().unwrap();
195 let path = write_test_fasta(dir.path(), &[("chr1", b"acgtNn")]);
196 let mut fasta = Fasta::from_path(&path).unwrap();
197
198 let seq = fasta.load_contig("chr1").unwrap();
199 assert_eq!(&seq, b"ACGTNN");
200 }
201
202 #[test]
203 fn test_load_contig_unknown() {
204 let dir = tempfile::tempdir().unwrap();
205 let path = write_test_fasta(dir.path(), &[("chr1", b"ACGT")]);
206 let mut fasta = Fasta::from_path(&path).unwrap();
207
208 let result = fasta.load_contig("chrZ");
209 assert!(result.is_err());
210 assert!(result.unwrap_err().to_string().contains("not found"));
211 }
212}