faimm/
lib.rs

1//! This crate provides indexed fasta access by using a memory mapped file to read the sequence
2//! data. It is intended for accessing sequence data on genome sized fasta files and provides
3//! random access based on base coordinates. Because an indexed fasta file uses a limited number of
4//! bases per line separated by (sometimes platform-specific) newlines you cannot directly use the
5//! bytes available from the mmap.
6//!
7//! Access is provided using a view of the mmap using zero-based base coordinates. This view can
8//! then be used to iterate over bases (represented as `u8`) or parsed into a string. Naive gc
9//! counting is also available.
10//!
11//! Access to the sequence data doesn't require the `IndexedFasta` to be mutable. This makes
12//! it easy to share.
13//!
14//! # Example
15//! ```
16//! use faimm::IndexedFasta;
17//! let fa = IndexedFasta::from_file("test/genome.fa").expect("Error opening fa");
18//! let chr_index = fa.fai().tid("ACGT-25").expect("Cannot find chr in index");
19//! let v = fa.view(chr_index,0,50).expect("Cannot get .fa view");
20//! //count the bases
21//! let counts = v.count_bases();
22//! //or print the sequence
23//! println!("{}", v.to_string());
24//! ```
25//! # Limitations
26//! The parser uses a simple ascii mask for allowable characters (64..128), does not apply any
27//! IUPAC converson or validation. Anything outside this range is silently skipped. This means that
28//! also invalid `fasta` will be parsed. The mere presence of an accompanying `.fai` provides the
29//! assumption of a valid fasta.
30//! Requires Rust >=1.64
31//!
32//! # Alternatives
33//! [Rust-bio](https://crates.io/crates/bio) provides a competent indexed fasta reader. The major
34//! difference is that it has an internal buffer an therefore needs to be mutable when performing
35//! read operations. faimm is also faster. If you want record based access (without an .fai index
36//! file) [rust-bio](https://crates.io/crates/bio) or [seq_io](https://crates.io/crates/seq_io)
37//! provide this.
38//!
39//! # Performance
40//! Calculating the gc content of target regions of an exome (231_410 regions) on the Human
41//! reference (GRCh38) takes about 0.7 seconds (warm cache), slightly faster than bedtools nuc (0.9s probably a more
42//! sound implementation) and rust-bio (1.3s same implementation as example)
43//! Some tests show counting can also be improved using simd, but nothing has been released.
44
45use std::fmt::{self, Write};
46use std::fs::File;
47use std::io::{self, BufRead, BufReader, Read};
48use std::path::Path;
49
50use indexmap::IndexSet;
51use memmap2::{Mmap, MmapOptions};
52
53/// The object that stores the parsed fasta index file. You can use it to map chromosome names to
54/// indexes and lookup offsets for chr-start:end coordinates
55#[derive(Debug, Clone)]
56pub struct Fai {
57    chromosomes: Vec<FaiRecord>,
58    name_map: IndexSet<String>,
59}
60
61impl Fai {
62    /// Open a fasta index file from path `P`.
63    pub fn from_file<P: AsRef<Path>>(path: P) -> io::Result<Self> {
64        let f = File::open(path)?;
65        let br = BufReader::new(f);
66
67        let mut name_map = IndexSet::new();
68        let mut chromosomes = Vec::new();
69
70        for l in br.lines() {
71            let line = l?;
72            let p: Vec<_> = line.split('\t').collect();
73
74            if p.len() != 5 {
75                return Err(io::Error::new(
76                    io::ErrorKind::InvalidData,
77                    "Expected 5 columns in .fai file.",
78                ));
79            }
80
81            name_map.insert(p[0].to_owned());
82
83            let ioerr =
84                |e, msg| io::Error::new(io::ErrorKind::InvalidData, format!("{}:{}", msg, e));
85            chromosomes.push(FaiRecord {
86                len: p[1]
87                    .parse()
88                    .map_err(|e| ioerr(e, "Error parsing chr len in .fai"))?,
89                offset: p[2]
90                    .parse()
91                    .map_err(|e| ioerr(e, "Error parsing chr offset in .fai"))?,
92                line_bases: p[3]
93                    .parse()
94                    .map_err(|e| ioerr(e, "Error parsing chr line_bases in .fai"))?,
95                line_width: p[4]
96                    .parse()
97                    .map_err(|e| ioerr(e, "Error parsing chr line_width in .fai"))?,
98            });
99        }
100
101        Ok(Fai {
102            chromosomes,
103            name_map,
104        })
105    }
106
107    /// Calculate the slice coordinates (byte offsets).
108    /// tid is the index of the chromosome (lookup with `Fai::tid` if necessary.
109    /// start, end: zero based coordinates of the requested range.
110    ///
111    /// Returns an tuple (start, end) if successful. `io::Error` otherwise.
112    #[inline]
113    pub fn offset(&self, tid: usize, start: usize, stop: usize) -> io::Result<(usize, usize)> {
114        let chr = &self.chromosomes.get(tid).ok_or_else(|| {
115            io::Error::other("Chromomsome tid was out of bounds")
116        })?;
117        if stop > chr.len {
118            return Err(io::Error::other("FASTA read interval was out of bounds"));
119        }
120
121        let start_offset =
122            chr.offset + (start / chr.line_bases) * chr.line_width + start % chr.line_bases;
123        let stop_offset =
124            chr.offset + (stop / chr.line_bases) * chr.line_width + stop % chr.line_bases;
125
126        Ok((start_offset, stop_offset))
127    }
128
129    /// Calculate the slice coordinates (byte offsets).
130    /// tid is the index of the chromosome (lookup with `Fai::tid` if necessary.
131    ///
132    /// Returns an tuple (start, end) if successful. `io::Error` otherwise.
133    #[inline]
134    pub fn offset_tid(&self, tid: usize) -> io::Result<(usize, usize)> {
135        let chr = &self.chromosomes.get(tid).ok_or_else(|| {
136            io::Error::other("Chromomsome tid was out of bounds")
137        })?;
138        let start_offset = chr.offset;
139        let stop_offset =
140            chr.offset + (chr.len / chr.line_bases) * chr.line_width + chr.len % chr.line_bases;
141        Ok((start_offset, stop_offset))
142    }
143
144    /// Return the index of the chromosome by name in the fasta index.
145    ///
146    /// Returns the position of chr `name` if succesful, None otherwise.
147    #[inline]
148    pub fn tid(&self, name: &str) -> Option<usize> {
149        self.name_map.get_index_of(name)
150    }
151
152    /// Return the index of a chromosome in the fasta index.
153    ///
154    /// Returns the size in bases as usize.
155    pub fn size(&self, tid: usize) -> io::Result<usize> {
156        let chr = &self.chromosomes.get(tid).ok_or_else(|| {
157            io::Error::other("Chromomsome tid was out of bounds")
158        })?;
159        Ok(chr.len)
160    }
161
162    /// Return the name of the chromomsome at index tid
163    pub fn name(&self, tid: usize) -> io::Result<&String> {
164        self.name_map.get_index(tid).ok_or_else(|| {
165            io::Error::other("Chromomsome tid was out of bounds")
166        })
167    }
168
169    /// Return the names of the chromosomes from the fasta index in the same order as in the
170    /// `.fai`. You can use `Fai::tid` to map it back to an index.
171    ///
172    /// Returns a `Vec<&str>` with the chromosome names.
173    pub fn names(&self) -> Vec<&str> {
174        self.name_map.iter().map(|s| s.as_str()).collect()
175    }
176}
177
178/// FaiRecord stores the length, offset, and fasta file characterics of a single chromosome
179#[derive(Debug, Clone)]
180pub struct FaiRecord {
181    len: usize,
182    offset: usize,
183    line_bases: usize,
184    line_width: usize,
185}
186
187/// The `IndexFasta` can be used to open a fasta file that has a valid .fai index file.
188pub struct IndexedFasta {
189    mmap: Mmap,
190    fasta_index: Fai,
191}
192
193impl IndexedFasta {
194    /// Open a fasta file from path `P`. It is assumed that it has a valid .fai index file. The
195    /// .fai file is created by appending .fai to the fasta file.
196    pub fn from_file<P: AsRef<Path>>(path: P) -> io::Result<Self> {
197        let mut fai_path = path.as_ref().as_os_str().to_owned();
198        fai_path.push(".fai");
199        let fasta_index = Fai::from_file(&fai_path)?;
200
201        let file = File::open(path)?;
202        let mmap = unsafe { MmapOptions::new().map(&file)? };
203        Ok(IndexedFasta { mmap, fasta_index })
204    }
205
206    /// Use tid, start and end to calculate a slice on the Fasta file. Use this view to iterate
207    /// over the bases.
208    ///
209    /// Returns FastaView for the provided chromsome, start, end if successful, Error otherwise.
210    pub fn view(&self, tid: usize, start: usize, stop: usize) -> io::Result<FastaView<'_>> {
211        if start > stop {
212            return Err(io::Error::other("Invalid query interval"));
213        }
214
215        let (start_byte, stop_byte) = self.fasta_index.offset(tid, start, stop)?;
216        //println!("offset for chr {}:{}-{} is {}-{}", tid, start, stop, start_byte, stop_byte);
217        Ok(FastaView(&self.mmap[start_byte..stop_byte]))
218    }
219
220    /// Use tid to return a view of an entire chromosome.
221    ///
222    /// Returns FastaView for the provided chromsome indicated by tid if successful, Error otherwise.
223    pub fn view_tid(&self, tid: usize) -> io::Result<FastaView<'_>> {
224        let (start_byte, stop_byte) = self.fasta_index.offset_tid(tid)?;
225        //println!("offset for chr {}:{}-{} is {}-{}", tid, start, stop, start_byte, stop_byte);
226        Ok(FastaView(&self.mmap[start_byte..stop_byte]))
227    }
228
229    /// Return a reference to the `Fai` that contains information from the fasta index.
230    ///
231    /// Returns a reference to `Fai`.
232    pub fn fai(&self) -> &Fai {
233        &self.fasta_index
234    }
235}
236
237/// A view of a slice of the fasta file bounded by provided coordinates
238pub struct FastaView<'a>(&'a [u8]);
239
240impl<'a> FastaView<'a> {
241    /// Count the occurences of A, C, G, T, N, and other in the current view. This function does
242    /// not differentiate between upper or lower case bases.
243    ///
244    /// Returns a `BasecCounts` object.
245    pub fn count_bases(&self) -> BaseCounts {
246        let mut bc: BaseCounts = Default::default();
247
248        for b in self.bases() {
249            let v: u8 = b << 3;
250            if v ^ 8 == 0 {
251                bc.a += 1;
252            } else if v ^ 24 == 0 {
253                bc.c += 1;
254            } else if v ^ 56 == 0 {
255                bc.g += 1;
256            } else if v ^ 112 == 0 {
257                bc.n += 1;
258            } else if v ^ 160 == 0 {
259                bc.t += 1;
260            } else {
261                bc.other += 1;
262            }
263        }
264
265        bc
266    }
267
268    /// Iterator over the bases in the current view. Bases are returned as `u8` representations of
269    /// the `char`s in the fasta file. Keep only that chars between 164 and 128 (effectively
270    /// skipping newlines)
271    pub fn bases(&self) -> impl Iterator<Item = &'a u8> {
272        self.0.iter().filter(|&&b| b & 192 == 64)
273    }
274}
275
276/// Returns a newly allocated, utf8-validated string with the sequence data in `Self`
277impl<'a> std::fmt::Display for FastaView<'a> {
278    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
279        self.bases().try_for_each(|&c| f.write_char(c as char))
280    }
281}
282
283impl<'a> Read for FastaView<'a> {
284    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
285        let mut read = 0;
286        let mut skipped = 0;
287        for (t, s) in buf.iter_mut().zip(self.0.iter().filter(|&&c| {
288            let base = c & 192 == 64;
289            if !base {
290                skipped += 1;
291            }
292            base
293        })) {
294            *t = *s;
295            read += 1;
296        }
297        self.0 = &self.0[(skipped + read)..];
298        Ok(read)
299    }
300}
301
302/// Object that contains count occurrences of the most common bases in DNA genome references: A, C, G,
303/// T, N and other.
304#[derive(Debug, Clone, Eq, PartialEq)]
305pub struct BaseCounts {
306    pub a: usize,
307    pub c: usize,
308    pub g: usize,
309    pub t: usize,
310    pub n: usize,
311    pub other: usize,
312}
313
314/// Initialize basecount with zeros
315impl Default for BaseCounts {
316    fn default() -> BaseCounts {
317        BaseCounts {
318            a: 0,
319            c: 0,
320            g: 0,
321            t: 0,
322            n: 0,
323            other: 0,
324        }
325    }
326}
327
328#[cfg(test)]
329mod tests {
330    use super::*;
331
332    #[test]
333    fn fai() {
334        let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
335        assert_eq!(ir.fai().names().len(), 3);
336        assert_eq!(ir.fai().tid("ACGT-25"), Some(2));
337        assert_eq!(ir.fai().tid("NotFound"), None);
338
339        assert_eq!(ir.fai().size(2).unwrap(), 100);
340        assert_eq!(ir.fai().name(2).unwrap(), "ACGT-25");
341        assert!(ir.fai().name(3).is_err());
342    }
343
344    #[test]
345    fn view() {
346        let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
347        assert_eq!(ir.view(0, 0, 10).unwrap().to_string(), "AAAAAAAAAA");
348        assert!(ir.view(0, 0, 11).is_err());
349
350        assert_eq!(
351            ir.view(2, 38, 62).unwrap().to_string(),
352            "CCCCCCCCCCCCGGGGGGGGGGGG"
353        );
354        assert_eq!(
355            ir.view(2, 74, 100).unwrap().to_string(),
356            "GTTTTTTTTTTTTTTTTTTTTTTTTT"
357        );
358        assert!(ir.view(0, 120, 130).is_err());
359    }
360
361    #[test]
362    fn view_tid() {
363        let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
364        assert_eq!(ir.view_tid(0).unwrap().to_string(), "AAAAAAAAAA");
365        assert_eq!(ir.view_tid(1).unwrap().to_string(),
366            "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
367        assert_eq!(ir.view_tid(2).unwrap().to_string(),
368            "AAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTT");
369        assert!(ir.view_tid(3).is_err());
370    }
371
372    #[test]
373    fn view_bases() {
374        let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
375        let v = ir.view(2, 48, 52).unwrap();
376        let mut b = v.bases();
377        assert_eq!(b.next(), Some(&b'C'));
378        assert_eq!(b.next(), Some(&b'C'));
379        assert_eq!(b.next(), Some(&b'G'));
380        assert_eq!(b.next(), Some(&b'G'));
381        assert_eq!(b.next(), None);
382    }
383
384    #[test]
385    fn view_counts() {
386        let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
387        assert_eq!(
388            ir.view(2, 48, 52).unwrap().count_bases(),
389            BaseCounts {
390                c: 2,
391                g: 2,
392                ..Default::default()
393            }
394        );
395    }
396
397    #[test]
398    fn read_view() {
399        let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
400        let mut buf = vec![0; 25];
401        let mut v = ir.view_tid(2).unwrap();
402        println!("{}", v.to_string());
403        assert_eq!(v.read(&mut buf).unwrap(), 25);
404        assert_eq!(buf, vec![b'A'; 25]);
405        assert_eq!(v.read(&mut buf).unwrap(), 25);
406        assert_eq!(buf, vec![b'C'; 25]);
407        assert_eq!(v.read(&mut buf).unwrap(), 25);
408        assert_eq!(buf, vec![b'G'; 25]);
409
410        let mut buf2 = vec![0; 10];
411        assert_eq!(v.read(&mut buf2).unwrap(), 10);
412        assert_eq!(buf2, vec![b'T'; 10]);
413        assert_eq!(v.read(&mut buf2).unwrap(), 10);
414        assert_eq!(buf2, vec![b'T'; 10]);
415        assert_eq!(v.read(&mut buf2).unwrap(), 5);
416        assert_eq!(&buf2[0..5], vec![b'T'; 5].as_slice());
417    }
418}