1use std::fmt::{self, Write};
46use std::fs::File;
47use std::io::{self, BufRead, BufReader, Read};
48use std::path::Path;
49
50use indexmap::IndexSet;
51use memmap2::{Mmap, MmapOptions};
52
53#[derive(Debug, Clone)]
56pub struct Fai {
57 chromosomes: Vec<FaiRecord>,
58 name_map: IndexSet<String>,
59}
60
61impl Fai {
62 pub fn from_file<P: AsRef<Path>>(path: P) -> io::Result<Self> {
64 let f = File::open(path)?;
65 let br = BufReader::new(f);
66
67 let mut name_map = IndexSet::new();
68 let mut chromosomes = Vec::new();
69
70 for l in br.lines() {
71 let line = l?;
72 let p: Vec<_> = line.split('\t').collect();
73
74 if p.len() != 5 {
75 return Err(io::Error::new(
76 io::ErrorKind::InvalidData,
77 "Expected 5 columns in .fai file.",
78 ));
79 }
80
81 name_map.insert(p[0].to_owned());
82
83 let ioerr =
84 |e, msg| io::Error::new(io::ErrorKind::InvalidData, format!("{}:{}", msg, e));
85 chromosomes.push(FaiRecord {
86 len: p[1]
87 .parse()
88 .map_err(|e| ioerr(e, "Error parsing chr len in .fai"))?,
89 offset: p[2]
90 .parse()
91 .map_err(|e| ioerr(e, "Error parsing chr offset in .fai"))?,
92 line_bases: p[3]
93 .parse()
94 .map_err(|e| ioerr(e, "Error parsing chr line_bases in .fai"))?,
95 line_width: p[4]
96 .parse()
97 .map_err(|e| ioerr(e, "Error parsing chr line_width in .fai"))?,
98 });
99 }
100
101 Ok(Fai {
102 chromosomes,
103 name_map,
104 })
105 }
106
107 #[inline]
113 pub fn offset(&self, tid: usize, start: usize, stop: usize) -> io::Result<(usize, usize)> {
114 let chr = &self.chromosomes.get(tid).ok_or_else(|| {
115 io::Error::other("Chromomsome tid was out of bounds")
116 })?;
117 if stop > chr.len {
118 return Err(io::Error::other("FASTA read interval was out of bounds"));
119 }
120
121 let start_offset =
122 chr.offset + (start / chr.line_bases) * chr.line_width + start % chr.line_bases;
123 let stop_offset =
124 chr.offset + (stop / chr.line_bases) * chr.line_width + stop % chr.line_bases;
125
126 Ok((start_offset, stop_offset))
127 }
128
129 #[inline]
134 pub fn offset_tid(&self, tid: usize) -> io::Result<(usize, usize)> {
135 let chr = &self.chromosomes.get(tid).ok_or_else(|| {
136 io::Error::other("Chromomsome tid was out of bounds")
137 })?;
138 let start_offset = chr.offset;
139 let stop_offset =
140 chr.offset + (chr.len / chr.line_bases) * chr.line_width + chr.len % chr.line_bases;
141 Ok((start_offset, stop_offset))
142 }
143
144 #[inline]
148 pub fn tid(&self, name: &str) -> Option<usize> {
149 self.name_map.get_index_of(name)
150 }
151
152 pub fn size(&self, tid: usize) -> io::Result<usize> {
156 let chr = &self.chromosomes.get(tid).ok_or_else(|| {
157 io::Error::other("Chromomsome tid was out of bounds")
158 })?;
159 Ok(chr.len)
160 }
161
162 pub fn name(&self, tid: usize) -> io::Result<&String> {
164 self.name_map.get_index(tid).ok_or_else(|| {
165 io::Error::other("Chromomsome tid was out of bounds")
166 })
167 }
168
169 pub fn names(&self) -> Vec<&str> {
174 self.name_map.iter().map(|s| s.as_str()).collect()
175 }
176}
177
178#[derive(Debug, Clone)]
180pub struct FaiRecord {
181 len: usize,
182 offset: usize,
183 line_bases: usize,
184 line_width: usize,
185}
186
187pub struct IndexedFasta {
189 mmap: Mmap,
190 fasta_index: Fai,
191}
192
193impl IndexedFasta {
194 pub fn from_file<P: AsRef<Path>>(path: P) -> io::Result<Self> {
197 let mut fai_path = path.as_ref().as_os_str().to_owned();
198 fai_path.push(".fai");
199 let fasta_index = Fai::from_file(&fai_path)?;
200
201 let file = File::open(path)?;
202 let mmap = unsafe { MmapOptions::new().map(&file)? };
203 Ok(IndexedFasta { mmap, fasta_index })
204 }
205
206 pub fn view(&self, tid: usize, start: usize, stop: usize) -> io::Result<FastaView<'_>> {
211 if start > stop {
212 return Err(io::Error::other("Invalid query interval"));
213 }
214
215 let (start_byte, stop_byte) = self.fasta_index.offset(tid, start, stop)?;
216 Ok(FastaView(&self.mmap[start_byte..stop_byte]))
218 }
219
220 pub fn view_tid(&self, tid: usize) -> io::Result<FastaView<'_>> {
224 let (start_byte, stop_byte) = self.fasta_index.offset_tid(tid)?;
225 Ok(FastaView(&self.mmap[start_byte..stop_byte]))
227 }
228
229 pub fn fai(&self) -> &Fai {
233 &self.fasta_index
234 }
235}
236
237pub struct FastaView<'a>(&'a [u8]);
239
240impl<'a> FastaView<'a> {
241 pub fn count_bases(&self) -> BaseCounts {
246 let mut bc: BaseCounts = Default::default();
247
248 for b in self.bases() {
249 let v: u8 = b << 3;
250 if v ^ 8 == 0 {
251 bc.a += 1;
252 } else if v ^ 24 == 0 {
253 bc.c += 1;
254 } else if v ^ 56 == 0 {
255 bc.g += 1;
256 } else if v ^ 112 == 0 {
257 bc.n += 1;
258 } else if v ^ 160 == 0 {
259 bc.t += 1;
260 } else {
261 bc.other += 1;
262 }
263 }
264
265 bc
266 }
267
268 pub fn bases(&self) -> impl Iterator<Item = &'a u8> {
272 self.0.iter().filter(|&&b| b & 192 == 64)
273 }
274}
275
276impl<'a> std::fmt::Display for FastaView<'a> {
278 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
279 self.bases().try_for_each(|&c| f.write_char(c as char))
280 }
281}
282
283impl<'a> Read for FastaView<'a> {
284 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
285 let mut read = 0;
286 let mut skipped = 0;
287 for (t, s) in buf.iter_mut().zip(self.0.iter().filter(|&&c| {
288 let base = c & 192 == 64;
289 if !base {
290 skipped += 1;
291 }
292 base
293 })) {
294 *t = *s;
295 read += 1;
296 }
297 self.0 = &self.0[(skipped + read)..];
298 Ok(read)
299 }
300}
301
302#[derive(Debug, Clone, Eq, PartialEq)]
305pub struct BaseCounts {
306 pub a: usize,
307 pub c: usize,
308 pub g: usize,
309 pub t: usize,
310 pub n: usize,
311 pub other: usize,
312}
313
314impl Default for BaseCounts {
316 fn default() -> BaseCounts {
317 BaseCounts {
318 a: 0,
319 c: 0,
320 g: 0,
321 t: 0,
322 n: 0,
323 other: 0,
324 }
325 }
326}
327
328#[cfg(test)]
329mod tests {
330 use super::*;
331
332 #[test]
333 fn fai() {
334 let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
335 assert_eq!(ir.fai().names().len(), 3);
336 assert_eq!(ir.fai().tid("ACGT-25"), Some(2));
337 assert_eq!(ir.fai().tid("NotFound"), None);
338
339 assert_eq!(ir.fai().size(2).unwrap(), 100);
340 assert_eq!(ir.fai().name(2).unwrap(), "ACGT-25");
341 assert!(ir.fai().name(3).is_err());
342 }
343
344 #[test]
345 fn view() {
346 let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
347 assert_eq!(ir.view(0, 0, 10).unwrap().to_string(), "AAAAAAAAAA");
348 assert!(ir.view(0, 0, 11).is_err());
349
350 assert_eq!(
351 ir.view(2, 38, 62).unwrap().to_string(),
352 "CCCCCCCCCCCCGGGGGGGGGGGG"
353 );
354 assert_eq!(
355 ir.view(2, 74, 100).unwrap().to_string(),
356 "GTTTTTTTTTTTTTTTTTTTTTTTTT"
357 );
358 assert!(ir.view(0, 120, 130).is_err());
359 }
360
361 #[test]
362 fn view_tid() {
363 let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
364 assert_eq!(ir.view_tid(0).unwrap().to_string(), "AAAAAAAAAA");
365 assert_eq!(ir.view_tid(1).unwrap().to_string(),
366 "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
367 assert_eq!(ir.view_tid(2).unwrap().to_string(),
368 "AAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTT");
369 assert!(ir.view_tid(3).is_err());
370 }
371
372 #[test]
373 fn view_bases() {
374 let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
375 let v = ir.view(2, 48, 52).unwrap();
376 let mut b = v.bases();
377 assert_eq!(b.next(), Some(&b'C'));
378 assert_eq!(b.next(), Some(&b'C'));
379 assert_eq!(b.next(), Some(&b'G'));
380 assert_eq!(b.next(), Some(&b'G'));
381 assert_eq!(b.next(), None);
382 }
383
384 #[test]
385 fn view_counts() {
386 let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
387 assert_eq!(
388 ir.view(2, 48, 52).unwrap().count_bases(),
389 BaseCounts {
390 c: 2,
391 g: 2,
392 ..Default::default()
393 }
394 );
395 }
396
397 #[test]
398 fn read_view() {
399 let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
400 let mut buf = vec![0; 25];
401 let mut v = ir.view_tid(2).unwrap();
402 println!("{}", v.to_string());
403 assert_eq!(v.read(&mut buf).unwrap(), 25);
404 assert_eq!(buf, vec![b'A'; 25]);
405 assert_eq!(v.read(&mut buf).unwrap(), 25);
406 assert_eq!(buf, vec![b'C'; 25]);
407 assert_eq!(v.read(&mut buf).unwrap(), 25);
408 assert_eq!(buf, vec![b'G'; 25]);
409
410 let mut buf2 = vec![0; 10];
411 assert_eq!(v.read(&mut buf2).unwrap(), 10);
412 assert_eq!(buf2, vec![b'T'; 10]);
413 assert_eq!(v.read(&mut buf2).unwrap(), 10);
414 assert_eq!(buf2, vec![b'T'; 10]);
415 assert_eq!(v.read(&mut buf2).unwrap(), 5);
416 assert_eq!(&buf2[0..5], vec![b'T'; 5].as_slice());
417 }
418}