bio/io/
fasta.rs

1// Copyright 2014-2018 Johannes Köster, Christopher Schröder, Henning Timm.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Structs and trait to read and write files in FASTA format.
7//!
8//! # Example
9//!
10//! ## Read
11//!
12//! In this example, we parse a fasta file from stdin and compute some statistics
13//!
14//! ```no_run
15//! use bio::io::fasta;
16//! use std::io;
17//!
18//! let mut reader = fasta::Reader::new(io::stdin());
19//!
20//! let mut nb_reads = 0;
21//! let mut nb_bases = 0;
22//!
23//! for result in reader.records() {
24//!     let record = result.expect("Error during fasta record parsing");
25//!     println!("{}", record.id());
26//!
27//!     nb_reads += 1;
28//!     nb_bases += record.seq().len();
29//! }
30//!
31//! println!("Number of reads: {}", nb_reads);
32//! println!("Number of bases: {}", nb_bases);
33//! ```
34//!
35//! We can also use a `while` loop to iterate over records.
36//! This is slightly faster than the `for` loop.
37//! ```no_run
38//! use bio::io::fasta;
39//! use std::io;
40//! let mut records = fasta::Reader::new(io::stdin()).records();
41//!
42//! let mut nb_reads = 0;
43//! let mut nb_bases = 0;
44//!
45//! while let Some(Ok(record)) = records.next() {
46//!     nb_reads += 1;
47//!     nb_bases += record.seq().len();
48//! }
49//!
50//! println!("Number of reads: {}", nb_reads);
51//! println!("Number of bases: {}", nb_bases);
52//! ```
53//!
54//! ## Write
55//!
56//! In this example we generate 10 random sequences with length 100 and write them to stdout.
57//!
58//! ```
59//! use std::io;
60//! use bio::io::fasta;
61//!
62//! let mut seed = 42;
63//!
64//! let nucleotides = [b'A', b'C', b'G', b'T'];
65//!
66//! let mut writer = fasta::Writer::new(io::stdout());
67//!
68//! for _ in 0..10 {
69//!     let seq = (0..100).map(|_| {
70//!         seed = ((seed ^ seed << 13) ^ seed >> 7) ^ seed << 17; // don't use this random generator
71//!         nucleotides[seed % 4]
72//!     }).collect::<Vec<u8>>();
73//!
74//!    writer.write("random", None, seq.as_slice()).expect("Error writing record.");
75//! }
76//! ```
77//!
78//! ## Read and Write
79//!
80//! In this example we filter reads from stdin on sequence length and write them to stdout
81//!
82//! ```no_run
83//! use bio::io::fasta;
84//! use bio::io::fasta::FastaRead;
85//! use std::io;
86//!
87//! let mut reader = fasta::Reader::new(io::stdin());
88//! let mut writer = fasta::Writer::new(io::stdout());
89//! let mut record = fasta::Record::new();
90//!
91//! while let Ok(()) = reader.read(&mut record) {
92//!     if record.is_empty() {
93//!         break;
94//!     }
95//!
96//!     if record.seq().len() > 100 {
97//!         writer
98//!             .write_record(&record)
99//!             .ok()
100//!             .expect("Error writing record.");
101//!     }
102//! }
103//! ```
104//!
105//! ## Index
106//!
107//! Random access to FASTA files is facilitated by [`Index`] and [`IndexedReader`]. The FASTA files
108//! must already be indexed with [`samtools faidx`](https://www.htslib.org/doc/faidx.html).
109//!
110//! In this example, we read in the first 10 bases of the sequence named "chr1".
111//!
112//! ```rust
113//! use bio::io::fasta::IndexedReader;
114//! // create dummy files
115//! const FASTA_FILE: &[u8] = b">chr1\nGTAGGCTGAAAA\nCCCC";
116//! const FAI_FILE: &[u8] = b"chr1\t16\t6\t12\t13";
117//!
118//! let seq_name = "chr1";
119//! let start: u64 = 0; // start is 0-based, inclusive
120//! let stop: u64 = 10; // stop is 0-based, exclusive
121//!                     // load the index
122//! let mut faidx = IndexedReader::new(std::io::Cursor::new(FASTA_FILE), FAI_FILE).unwrap();
123//! // move the pointer in the index to the desired sequence and interval
124//! faidx
125//!     .fetch(seq_name, start, stop)
126//!     .expect("Couldn't fetch interval");
127//! // read the subsequence defined by the interval into a vector
128//! let mut seq = Vec::new();
129//! faidx.read(&mut seq).expect("Couldn't read the interval");
130//! assert_eq!(seq, b"GTAGGCTGAA");
131//! ```
132
133use std::cmp::min;
134use std::collections;
135use std::convert::AsRef;
136use std::fs;
137use std::io;
138use std::io::prelude::*;
139use std::path::Path;
140
141use crate::utils::{Text, TextSlice};
142use anyhow::Context;
143use std::fmt;
144
/// Maximum size of temporary buffer used for reading indexed FASTA files.
///
/// This caps the initial capacity of the byte buffer that
/// `IndexedReader::read_into_iter` allocates for the iterator it returns.
const MAX_FASTA_BUFFER_SIZE: usize = 512;
147
/// Trait for FASTA readers.
pub trait FastaRead {
    /// Read the next FASTA record into `record`.
    ///
    /// Failures are reported as an `io::Error`.
    fn read(&mut self, record: &mut Record) -> io::Result<()>;
}
152
/// A FASTA reader.
///
/// Records are parsed line by line from the wrapped input.
#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
pub struct Reader<B> {
    // Input source that lines are read from.
    reader: B,
    // Line buffer reused between calls to `read`. After a record has been read
    // it holds the header line of the following record, or is empty once the
    // input is exhausted.
    line: String,
}
159
160impl Reader<io::BufReader<fs::File>> {
161    /// Read FASTA from given file path.
162    pub fn from_file<P: AsRef<Path> + std::fmt::Debug>(path: P) -> anyhow::Result<Self> {
163        fs::File::open(&path)
164            .map(Reader::new)
165            .with_context(|| format!("Failed to read fasta from {:#?}", path))
166    }
167
168    /// Read FASTA from give file path and a capacity
169    pub fn from_file_with_capacity<P: AsRef<Path> + std::fmt::Debug>(
170        capacity: usize,
171        path: P,
172    ) -> anyhow::Result<Self> {
173        fs::File::open(&path)
174            .map(|file| Reader::with_capacity(capacity, file))
175            .with_context(|| format!("Failed to read fasta from {:#?}", path))
176    }
177}
178
179impl<R> Reader<io::BufReader<R>>
180where
181    R: io::Read,
182{
183    /// Create a new Fasta reader given an instance of `io::Read`.
184    ///
185    /// # Example
186    /// ```rust
187    /// # use std::io;
188    /// # use bio::io::fasta::Reader;
189    /// # fn main() {
190    /// # const fasta_file: &'static [u8] = b">id desc
191    /// # AAAA
192    /// # ";
193    /// let reader = Reader::new(fasta_file);
194    /// # }
195    /// ```
196    pub fn new(reader: R) -> Self {
197        Reader {
198            reader: io::BufReader::new(reader),
199            line: String::new(),
200        }
201    }
202
203    /// Create a new Fasta reader given a capacity and an instance of `io::Read`.
204    ///
205    /// # Example
206    /// ```rust
207    /// # use std::io;
208    /// # use bio::io::fasta::Reader;
209    /// # fn main() {
210    /// # const fasta_file: &'static [u8] = b">id desc
211    /// # AAAA
212    /// # ";
213    /// let reader = Reader::with_capacity(16384, fasta_file);
214    /// # }
215    /// ```
216    pub fn with_capacity(capacity: usize, reader: R) -> Self {
217        Reader {
218            reader: io::BufReader::with_capacity(capacity, reader),
219            line: String::new(),
220        }
221    }
222}
223
224impl<B> Reader<B>
225where
226    B: io::BufRead,
227{
228    /// Create a new Fasta reader with an object that implements `io::BufRead`.
229    ///
230    /// # Example
231    /// ```rust
232    /// # use std::io;
233    /// # use bio::io::fasta::Reader;
234    /// # fn main() {
235    /// # const fasta_file: &'static [u8] = b">id desc
236    /// # AAAA
237    /// # ";
238    /// let buffer = io::BufReader::with_capacity(16384, fasta_file);
239    /// let reader = Reader::from_bufread(buffer);
240    /// # }
241    /// ```
242    pub fn from_bufread(bufreader: B) -> Self {
243        Reader {
244            reader: bufreader,
245            line: String::new(),
246        }
247    }
248
249    /// Return an iterator over the records of this Fasta file.
250    ///
251    /// # Example
252    /// ```rust
253    /// # use std::io;
254    /// # use bio::io::fasta::Reader;
255    /// # use bio::io::fasta::Record;
256    /// # fn main() {
257    /// # const fasta_file: &'static [u8] = b">id desc
258    /// # AAAA
259    /// # ";
260    /// # let reader = Reader::new(fasta_file);
261    /// for record in reader.records() {
262    ///     let record = record.unwrap();
263    ///     assert_eq!(record.id(), "id");
264    ///     assert_eq!(record.desc().unwrap(), "desc");
265    ///     assert_eq!(record.seq().to_vec(), b"AAAA");
266    /// }
267    /// # }
268    /// ```
269    pub fn records(self) -> Records<B> {
270        Records {
271            reader: self,
272            error_has_occured: false,
273        }
274    }
275}
276
277impl<B> FastaRead for Reader<B>
278where
279    B: io::BufRead,
280{
281    /// Read the next FASTA record into the given `Record`.
282    /// An empty record indicates that no more records can be read.
283    ///
284    /// Use this method when you want to read records as fast as
285    /// possible because it allows the reuse of a `Record` allocation.
286    ///
287    /// The [records](Reader::records) iterator provides a more ergonomic
288    /// approach to accessing FASTA records.
289    ///
290    /// # Errors
291    ///
292    /// This function will return an error if the record is incomplete,
293    /// syntax is violated or any form of I/O error is encountered.
294    ///
295    /// # Example
296    ///
297    /// ```rust
298    /// use bio::io::fasta::Record;
299    /// use bio::io::fasta::{FastaRead, Reader};
300    ///
301    /// const fasta_file: &'static [u8] = b">id desc
302    /// AAAA
303    /// ";
304    /// let mut reader = Reader::new(fasta_file);
305    /// let mut record = Record::new();
306    ///
307    /// // Check for errors parsing the record
308    /// reader
309    ///     .read(&mut record)
310    ///     .expect("fasta reader: got an io::Error or could not read_line()");
311    ///
312    /// assert_eq!(record.id(), "id");
313    /// assert_eq!(record.desc().unwrap(), "desc");
314    /// assert_eq!(record.seq().to_vec(), b"AAAA");
315    /// ```
316    fn read(&mut self, record: &mut Record) -> io::Result<()> {
317        record.clear();
318        if self.line.is_empty() {
319            self.reader.read_line(&mut self.line)?;
320            if self.line.is_empty() {
321                return Ok(());
322            }
323        }
324
325        if !self.line.starts_with('>') {
326            return Err(io::Error::new(
327                io::ErrorKind::Other,
328                "Expected > at record start.",
329            ));
330        }
331        let mut header_fields = self.line[1..].trim_end().splitn(2, char::is_whitespace);
332        record.id = header_fields.next().map(|s| s.to_owned()).unwrap();
333        record.desc = header_fields.next().map(|s| s.to_owned());
334        loop {
335            self.line.clear();
336            self.reader.read_line(&mut self.line)?;
337            if self.line.is_empty() || self.line.starts_with('>') {
338                break;
339            }
340            record.seq.push_str(self.line.trim_end());
341        }
342
343        Ok(())
344    }
345}
346
/// A FASTA index as created by SAMtools (.fai).
#[derive(Default, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
pub struct Index {
    // Index records in the order in which they appear in the index file.
    inner: Vec<IndexRecord>,
    // Maps a sequence name to the position of its record in `inner`.
    name_to_rid: collections::HashMap<String, usize>,
}
353
354impl Index {
355    /// Open a FASTA index from a given `io::Read` instance.
356    pub fn new<R: io::Read>(fai: R) -> csv::Result<Self> {
357        let mut inner = vec![];
358        let mut name_to_rid = collections::HashMap::new();
359
360        let mut fai_reader = csv::ReaderBuilder::new()
361            .delimiter(b'\t')
362            .has_headers(false)
363            .from_reader(fai);
364        for (rid, row) in fai_reader.deserialize().enumerate() {
365            let record: IndexRecord = row?;
366            name_to_rid.insert(record.name.clone(), rid);
367            inner.push(record);
368        }
369        Ok(Index { inner, name_to_rid })
370    }
371
372    /// Open a FASTA index from a given file path.
373    pub fn from_file<P: AsRef<Path> + std::fmt::Debug>(path: &P) -> anyhow::Result<Self> {
374        fs::File::open(path)
375            .map_err(csv::Error::from)
376            .and_then(Self::new)
377            .with_context(|| format!("Failed to read fasta index from {:#?}", path))
378    }
379
380    /// Open a FASTA index given the corresponding FASTA file path.
381    /// That is, for ref.fasta we expect ref.fasta.fai.
382    pub fn with_fasta_file<P: AsRef<Path>>(fasta_path: &P) -> anyhow::Result<Self> {
383        let mut fai_path = fasta_path.as_ref().as_os_str().to_owned();
384        fai_path.push(".fai");
385
386        Self::from_file(&fai_path)
387    }
388
389    /// Return a vector of sequences described in the index.
390    pub fn sequences(&self) -> Vec<Sequence> {
391        // sort kv pairs by rid to preserve order
392        self.inner
393            .iter()
394            .map(|record| Sequence {
395                name: record.name.clone(),
396                len: record.len,
397            })
398            .collect()
399    }
400}
401
/// A FASTA reader with an index as created by SAMtools (.fai).
///
/// An interval is first selected with one of the `fetch*` methods and then
/// retrieved with `read` or `read_iter`.
#[derive(Debug)]
pub struct IndexedReader<R: io::Read + io::Seek> {
    // Buffered, seekable access to the FASTA data.
    reader: io::BufReader<R>,
    /// The index describing the sequences of the FASTA data.
    pub index: Index,
    // Index record of the most recently fetched sequence, if any.
    fetched_idx: Option<IndexRecord>,
    // Start of the fetched interval (0-based, inclusive), if any.
    start: Option<u64>,
    // End of the fetched interval (0-based, exclusive), if any.
    stop: Option<u64>,
}
411
412impl IndexedReader<fs::File> {
413    /// Read from a given file path. This assumes the index ref.fasta.fai to be
414    /// present for FASTA ref.fasta.
415    pub fn from_file<P: AsRef<Path> + std::fmt::Debug>(path: &P) -> anyhow::Result<Self> {
416        let index = Index::with_fasta_file(path)?;
417        fs::File::open(path)
418            .map(|f| Self::with_index(f, index))
419            .map_err(csv::Error::from)
420            .with_context(|| format!("Failed to read fasta from {:#?}", path))
421    }
422}
423
424impl<R: io::Read + io::Seek> IndexedReader<R> {
425    /// Read from a FASTA and its index, both given as `io::Read`. FASTA has to
426    /// be `io::Seek` in addition.
427    pub fn new<I: io::Read>(fasta: R, fai: I) -> csv::Result<Self> {
428        let index = Index::new(fai)?;
429        Ok(IndexedReader {
430            reader: io::BufReader::new(fasta),
431            index,
432            fetched_idx: None,
433            start: None,
434            stop: None,
435        })
436    }
437
438    /// Read from a FASTA and its index, the first given as `io::Read`, the
439    /// second given as index object.
440    pub fn with_index(fasta: R, index: Index) -> Self {
441        IndexedReader {
442            reader: io::BufReader::new(fasta),
443            index,
444            fetched_idx: None,
445            start: None,
446            stop: None,
447        }
448    }
449
450    /// Fetch an interval from the sequence with the given name for reading.
451    ///
452    /// `start` and `stop` are 0-based and `stop` is exclusive - i.e. `[start, stop)`
453    ///
454    /// # Example
455    ///
456    /// ```rust
457    /// use bio::io::fasta::IndexedReader;
458    /// // create dummy files
459    /// const FASTA_FILE: &[u8] = b">chr1\nGTAGGCTGAAAA\nCCCC";
460    /// const FAI_FILE: &[u8] = b"chr1\t16\t6\t12\t13";
461    ///
462    /// let seq_name = "chr1";
463    /// let start: u64 = 0; // start is 0-based, inclusive
464    /// let stop: u64 = 10; // stop is 0-based, exclusive
465    ///                     // load the index
466    /// let mut faidx = IndexedReader::new(std::io::Cursor::new(FASTA_FILE), FAI_FILE).unwrap();
467    /// // move the pointer in the index to the desired sequence and interval
468    /// faidx
469    ///     .fetch(seq_name, start, stop)
470    ///     .expect("Couldn't fetch interval");
471    /// // read the subsequence defined by the interval into a vector
472    /// let mut seq = Vec::new();
473    /// faidx.read(&mut seq).expect("Couldn't read the interval");
474    /// assert_eq!(seq, b"GTAGGCTGAA");
475    /// ```
476    ///
477    /// # Errors
478    /// If the `seq_name` does not exist within the index.
479    pub fn fetch(&mut self, seq_name: &str, start: u64, stop: u64) -> io::Result<()> {
480        let idx = self.idx(seq_name)?;
481        self.start = Some(start);
482        self.stop = Some(stop);
483        self.fetched_idx = Some(idx);
484        Ok(())
485    }
486
487    /// Fetch an interval from the sequence with the given record index for reading.
488    ///
489    /// `start` and `stop` are 0-based and `stop` is exclusive - i.e. `[start, stop)`
490    ///
491    /// # Example
492    ///
493    /// ```rust
494    /// use bio::io::fasta::IndexedReader;
495    /// // create dummy files
496    /// const FASTA_FILE: &[u8] = b">chr1\nGTAGGCTGAAAA\nCCCC";
497    /// const FAI_FILE: &[u8] = b"chr1\t16\t6\t12\t13";
498    ///
499    /// let rid: usize = 0;
500    /// let start: u64 = 0; // start is 0-based, inclusive
501    /// let stop: u64 = 10; // stop is 0-based, exclusive
502    ///                     // load the index
503    /// let mut faidx = IndexedReader::new(std::io::Cursor::new(FASTA_FILE), FAI_FILE).unwrap();
504    /// // move the pointer in the index to the desired sequence and interval
505    /// faidx
506    ///     .fetch_by_rid(rid, start, stop)
507    ///     .expect("Couldn't fetch interval");
508    /// // read the subsequence defined by the interval into a vector
509    /// let mut seq = Vec::new();
510    /// faidx.read(&mut seq).expect("Couldn't read the interval");
511    /// assert_eq!(seq, b"GTAGGCTGAA");
512    /// ```
513    ///
514    /// # Errors
515    /// If `rid` does not exist within the index.
516    pub fn fetch_by_rid(&mut self, rid: usize, start: u64, stop: u64) -> io::Result<()> {
517        let idx = self.idx_by_rid(rid)?;
518        self.start = Some(start);
519        self.stop = Some(stop);
520        self.fetched_idx = Some(idx);
521        Ok(())
522    }
523
524    /// Fetch the whole sequence with the given name for reading.
525    pub fn fetch_all(&mut self, seq_name: &str) -> io::Result<()> {
526        let idx = self.idx(seq_name)?;
527        self.start = Some(0);
528        self.stop = Some(idx.len);
529        self.fetched_idx = Some(idx);
530        Ok(())
531    }
532
533    /// Fetch the whole sequence with the given record index for reading.
534    pub fn fetch_all_by_rid(&mut self, rid: usize) -> io::Result<()> {
535        let idx = self.idx_by_rid(rid)?;
536        self.start = Some(0);
537        self.stop = Some(idx.len);
538        self.fetched_idx = Some(idx);
539        Ok(())
540    }
541
542    /// Read the fetched sequence into the given vector.
543    pub fn read(&mut self, seq: &mut Text) -> io::Result<()> {
544        let idx = self.fetched_idx.clone();
545        match (idx, self.start, self.stop) {
546            (Some(idx), Some(start), Some(stop)) => self.read_into_buffer(idx, start, stop, seq),
547            _ => Err(io::Error::new(
548                io::ErrorKind::Other,
549                "No sequence fetched for reading.",
550            )),
551        }
552    }
553
554    /// Return an iterator yielding the fetched sequence.
555    pub fn read_iter(&mut self) -> io::Result<IndexedReaderIterator<'_, R>> {
556        let idx = self.fetched_idx.clone();
557        match (idx, self.start, self.stop) {
558            (Some(idx), Some(start), Some(stop)) => self.read_into_iter(idx, start, stop),
559            _ => Err(io::Error::new(
560                io::ErrorKind::Other,
561                "No sequence fetched for reading.",
562            )),
563        }
564    }
565
566    fn read_into_buffer(
567        &mut self,
568        idx: IndexRecord,
569        start: u64,
570        stop: u64,
571        seq: &mut Text,
572    ) -> io::Result<()> {
573        if stop > idx.len {
574            return Err(io::Error::new(
575                io::ErrorKind::Other,
576                "FASTA read interval was out of bounds",
577            ));
578        } else if start > stop {
579            return Err(io::Error::new(
580                io::ErrorKind::Other,
581                "Invalid query interval",
582            ));
583        }
584
585        let mut bases_left = stop - start;
586        let mut line_offset = self.seek_to(&idx, start)?;
587
588        seq.clear();
589        while bases_left > 0 {
590            bases_left -= self.read_line(&idx, &mut line_offset, bases_left, seq)?;
591        }
592
593        Ok(())
594    }
595
596    fn read_into_iter(
597        &mut self,
598        idx: IndexRecord,
599        start: u64,
600        stop: u64,
601    ) -> io::Result<IndexedReaderIterator<'_, R>> {
602        if stop > idx.len {
603            return Err(io::Error::new(
604                io::ErrorKind::Other,
605                "FASTA read interval was out of bounds",
606            ));
607        } else if start > stop {
608            return Err(io::Error::new(
609                io::ErrorKind::Other,
610                "Invalid query interval",
611            ));
612        }
613
614        let bases_left = stop - start;
615        let line_offset = self.seek_to(&idx, start)?;
616        let capacity = min(
617            MAX_FASTA_BUFFER_SIZE,
618            min(bases_left, idx.line_bases) as usize,
619        );
620
621        Ok(IndexedReaderIterator {
622            reader: self,
623            record: idx,
624            bases_left,
625            line_offset,
626            buf: Vec::with_capacity(capacity),
627            buf_idx: 0,
628        })
629    }
630
631    /// Return the IndexRecord for the given sequence name or io::Result::Err
632    fn idx(&self, seqname: &str) -> io::Result<IndexRecord> {
633        match self.index.name_to_rid.get(seqname) {
634            Some(rid) => self.idx_by_rid(*rid),
635            None => Err(io::Error::new(
636                io::ErrorKind::Other,
637                format!("Unknown sequence name: {}.", seqname),
638            )),
639        }
640    }
641
642    /// Return the IndexRecord for the given record index or io::Result::Err
643    fn idx_by_rid(&self, rid: usize) -> io::Result<IndexRecord> {
644        match self.index.inner.get(rid) {
645            Some(record) => Ok(record.clone()),
646            None => Err(io::Error::new(
647                io::ErrorKind::Other,
648                "Invalid record index in fasta file.",
649            )),
650        }
651    }
652
653    /// Seek to the given position in the specified FASTA record. The position
654    /// of the cursor on the line that the seek ended on is returned.
655    fn seek_to(&mut self, idx: &IndexRecord, start: u64) -> io::Result<u64> {
656        assert!(start <= idx.len);
657
658        let line_offset = start % idx.line_bases;
659        let line_start = start / idx.line_bases * idx.line_bytes;
660        let offset = idx.offset + line_start + line_offset;
661        self.reader.seek(io::SeekFrom::Start(offset))?;
662
663        Ok(line_offset)
664    }
665
    /// Tries to read up to `bases_left` bases from the current line into `buf`,
    /// returning the actual number of bases read. Depending on the amount of
    /// whitespace per line, the current `line_offset`, and the amount of bytes
    /// returned from `BufReader::fill_buf`, this function may return Ok(0)
    /// multiple times in a row.
    ///
    /// `line_offset` is the byte position of the cursor within the current
    /// line. It is advanced by the number of bytes consumed and reset to 0
    /// once it reaches `idx.line_bytes`.
    ///
    /// # Errors
    /// Returns an `UnexpectedEof` error if no more data is available, or any
    /// error raised while filling the read buffer.
    fn read_line(
        &mut self,
        idx: &IndexRecord,
        line_offset: &mut u64,
        bases_left: u64,
        buf: &mut Vec<u8>,
    ) -> io::Result<u64> {
        let (bytes_to_read, bytes_to_keep) = {
            let src = self.reader.fill_buf()?;
            if src.is_empty() {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "FASTA file is truncated.",
                ));
            }

            // Bases remaining on the current line; 0 when the cursor is already
            // past the last base of the line (inside the line terminator).
            let bases_on_line = idx.line_bases - min(idx.line_bases, *line_offset);
            // Of those, the bases that are available in the buffered data.
            let bases_in_buffer = min(src.len() as u64, bases_on_line);

            let (bytes_to_read, bytes_to_keep) = if bases_in_buffer <= bases_left {
                // All buffered bases of this line are wanted: consume up to the
                // end of the line (terminator included), as far as buffered,
                // but keep only the bases.
                let bytes_to_read = min(src.len() as u64, idx.line_bytes - *line_offset);

                (bytes_to_read, bases_in_buffer)
            } else {
                // Fewer bases are wanted than available: consume and keep
                // exactly the requested amount.
                (bases_left, bases_left)
            };

            buf.extend_from_slice(&src[..bytes_to_keep as usize]);
            (bytes_to_read, bytes_to_keep)
        };

        // Mark the bytes as read so the next `fill_buf` call moves on.
        self.reader.consume(bytes_to_read as usize);

        assert!(bytes_to_read > 0);
        *line_offset += bytes_to_read;
        // The line has been consumed completely: continue at the start of the next one.
        if *line_offset >= idx.line_bytes {
            *line_offset = 0;
        }

        Ok(bytes_to_keep)
    }
712}
713
/// Record of a FASTA index.
///
/// Describes where one sequence is located in the FASTA data and how its
/// lines are laid out.
#[derive(Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
struct IndexRecord {
    // Name of the sequence.
    name: String,
    // Length of the sequence in bases.
    len: u64,
    // Byte offset in the FASTA data at which the sequence starts.
    offset: u64,
    // Number of bases per sequence line.
    line_bases: u64,
    // Number of bytes per sequence line, including the line terminator.
    line_bytes: u64,
}
723
/// A sequence record returned by the FASTA index.
#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
pub struct Sequence {
    /// Name of the sequence as stored in the index.
    pub name: String,
    /// Length of the sequence as stored in the index.
    pub len: u64,
}
730
/// Iterator over the bases of a fetched interval, as returned by
/// `IndexedReader::read_iter`.
#[derive(Debug)]
pub struct IndexedReaderIterator<'a, R: io::Read + io::Seek> {
    // Reader the bases are pulled from.
    reader: &'a mut IndexedReader<R>,
    // Index record of the sequence being read.
    record: IndexRecord,
    // Number of bases of the interval that have not been read into `buf` yet.
    bases_left: u64,
    // Byte position of the read cursor within the current FASTA line.
    line_offset: u64,
    // Bases that were read from the file but not all yielded yet.
    buf: Vec<u8>,
    // Position in `buf` of the next base to yield.
    buf_idx: usize,
}
740
741impl<'a, R: io::Read + io::Seek + 'a> IndexedReaderIterator<'a, R> {
742    fn fill_buffer(&mut self) -> io::Result<()> {
743        assert!(self.bases_left > 0);
744
745        self.buf.clear();
746        let bases_to_read = min(self.buf.capacity() as u64, self.bases_left);
747
748        // May loop one or more times; see IndexedReader::read_line.
749        while self.buf.is_empty() {
750            self.bases_left -= self.reader.read_line(
751                &self.record,
752                &mut self.line_offset,
753                bases_to_read,
754                &mut self.buf,
755            )?;
756        }
757
758        self.buf_idx = 0;
759        Ok(())
760    }
761}
762
763impl<'a, R: io::Read + io::Seek + 'a> Iterator for IndexedReaderIterator<'a, R> {
764    type Item = io::Result<u8>;
765
766    fn next(&mut self) -> Option<Self::Item> {
767        if self.buf_idx < self.buf.len() {
768            let item = Some(Ok(self.buf[self.buf_idx]));
769            self.buf_idx += 1;
770            item
771        } else if self.bases_left > 0 {
772            if let Err(e) = self.fill_buffer() {
773                self.bases_left = 0;
774                self.buf_idx = self.buf.len();
775
776                return Some(Err(e));
777            }
778
779            self.buf_idx = 1;
780            Some(Ok(self.buf[0]))
781        } else {
782            None
783        }
784    }
785
786    fn size_hint(&self) -> (usize, Option<usize>) {
787        let hint = self.bases_left as usize + (self.buf.len() - self.buf_idx);
788
789        (hint, Some(hint))
790    }
791}
792
/// A Fasta writer.
///
/// Output is buffered; call `flush` to make sure everything has been written.
#[derive(Debug)]
pub struct Writer<W: io::Write> {
    // Buffered output sink.
    writer: io::BufWriter<W>,
    // Number of sequence characters per output line; `None` writes each
    // sequence on a single line.
    linewrap: Option<usize>,
}
799
800impl Writer<fs::File> {
801    /// Write to the given file path.
802    #[allow(clippy::wrong_self_convention)]
803    pub fn to_file<P: AsRef<Path>>(path: P) -> io::Result<Self> {
804        fs::File::create(path).map(Writer::new)
805    }
806
807    /// Write to the given file path and a buffer capacity
808    pub fn to_file_with_capacity<P: AsRef<Path>>(capacity: usize, path: P) -> io::Result<Self> {
809        fs::File::create(path).map(|file| Writer::with_capacity(capacity, file))
810    }
811}
812
813impl<W: io::Write> Writer<W> {
814    /// Create a new Fasta writer.
815    pub fn new(writer: W) -> Self {
816        Writer {
817            writer: io::BufWriter::new(writer),
818            linewrap: None,
819        }
820    }
821
822    /// Create a new Fasta writer with a capacity of write buffer
823    pub fn with_capacity(capacity: usize, writer: W) -> Self {
824        Writer {
825            writer: io::BufWriter::with_capacity(capacity, writer),
826            linewrap: None,
827        }
828    }
829
830    /// Create a new Fasta writer with a given BufWriter
831    pub fn from_bufwriter(bufwriter: io::BufWriter<W>) -> Self {
832        Writer {
833            writer: bufwriter,
834            linewrap: None,
835        }
836    }
837
838    /// Directly write a [`fasta::Record`](struct.Record.html).
839    ///
840    /// # Errors
841    /// If there is an issue writing to the `Writer`.
842    ///
843    /// # Examples
844    /// ```rust
845    /// use bio::io::fasta::{Record, Writer};
846    /// use std::fs;
847    /// use std::io;
848    /// use std::path::Path;
849    ///
850    /// let path = Path::new("test.fa");
851    /// let file = fs::File::create(path).unwrap();
852    /// {
853    ///     let handle = io::BufWriter::new(file);
854    ///     let mut writer = Writer::new(handle);
855    ///     let record = Record::with_attrs("id", Some("desc"), b"ACGT");
856    ///
857    ///     let write_result = writer.write_record(&record);
858    ///     assert!(write_result.is_ok());
859    /// }
860    ///
861    /// let actual = fs::read_to_string(path).unwrap();
862    /// let expected = ">id desc\nACGT\n";
863    ///
864    /// assert!(fs::remove_file(path).is_ok());
865    /// assert_eq!(actual, expected)
866    /// ```
867    pub fn write_record(&mut self, record: &Record) -> io::Result<()> {
868        self.write(record.id(), record.desc(), record.seq())
869    }
870
871    /// Set line wrapping behavior.
872    ///
873    /// # Examples
874    /// ```rust
875    /// use bio::io::fasta::{Record, Writer};
876    /// use std::fs;
877    /// use std::io;
878    /// use std::path::Path;
879    ///
880    /// let path = Path::new("test.fa");
881    /// let file = fs::File::create(path).unwrap();
882    /// {
883    ///     let handle = io::BufWriter::new(file);
884    ///     let mut writer = Writer::new(handle);
885    ///
886    ///     // For demonstration width is 4 chars, use 50, 60 or 70 instead for production
887    ///     writer.set_linewrap(Some(4));
888    ///
889    ///     let record = Record::with_attrs("id", Some("desc"), b"ACGTACGT");
890    ///     let write_result = writer.write_record(&record);
891    ///     assert!(write_result.is_ok());
892    /// }
893    ///
894    /// let actual = fs::read_to_string(path).unwrap();
895    /// let expected = ">id desc\nACGT\nACGT\n";
896    ///
897    /// assert!(fs::remove_file(path).is_ok());
898    /// assert_eq!(actual, expected)
899    /// ```
900    pub fn set_linewrap(&mut self, linewrap: Option<usize>) {
901        self.linewrap = linewrap
902    }
903
904    pub fn write_record_header(&mut self, id: &str, desc: Option<&str>) -> io::Result<()> {
905        self.writer.write_all(b">")?;
906        self.writer.write_all(id.as_bytes())?;
907        if let Some(desc) = desc {
908            self.writer.write_all(b" ")?;
909            self.writer.write_all(desc.as_bytes())?;
910        }
911        self.writer.write_all(b"\n")?;
912
913        Ok(())
914    }
915
916    /// Write a Fasta record with given id, optional description and sequence.
917    pub fn write(&mut self, id: &str, desc: Option<&str>, seq: TextSlice<'_>) -> io::Result<()> {
918        self.write_record_header(id, desc)?;
919        if self.linewrap == None {
920            self.writer.write_all(seq)?;
921            self.writer.write_all(b"\n")?;
922            Ok(())
923        } else {
924            // Write Fasta lines with a given linewrap instead of in a single line
925            seq.chunks(self.linewrap.unwrap())
926                .try_for_each(|chunk| -> io::Result<()> {
927                    self.writer.write_all(chunk)?;
928                    self.writer.write_all(b"\n")?;
929
930                    Ok(())
931                })
932        }
933    }
934
935    /// Flush the writer, ensuring that everything is written.
936    pub fn flush(&mut self) -> io::Result<()> {
937        self.writer.flush()
938    }
939}
940
/// A FASTA record.
///
/// A record consists of an id, an optional description and a sequence. The
/// derived comparison and hashing traits consider the fields in declaration
/// order: id, description, sequence.
#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
pub struct Record {
    /// Identifier of the record, returned by [`Record::id`].
    id: String,
    /// Optional description, returned by [`Record::desc`].
    desc: Option<String>,
    /// Sequence, stored as text; [`Record::seq`] exposes it as bytes.
    seq: String,
}
948
949impl Record {
950    /// Create a new instance.
951    pub fn new() -> Self {
952        Record {
953            id: String::new(),
954            desc: None,
955            seq: String::new(),
956        }
957    }
958
959    /// Create a new `Record` from given attributes.
960    ///
961    /// # Examples
962    /// ```rust
963    /// use bio::io::fasta::Record;
964    ///
965    /// let read_id = "read1";
966    /// let description = Some("sampleid=foobar");
967    /// let sequence = b"ACGT";
968    /// let record = Record::with_attrs(read_id, description, sequence);
969    ///
970    /// assert_eq!(">read1 sampleid=foobar\nACGT\n", record.to_string())
971    /// ```
972    pub fn with_attrs(id: &str, desc: Option<&str>, seq: TextSlice<'_>) -> Self {
973        let desc = desc.map(|desc| desc.to_owned());
974        Record {
975            id: id.to_owned(),
976            desc,
977            seq: String::from_utf8(seq.to_vec()).unwrap(),
978        }
979    }
980
981    /// Check if record is empty.
982    pub fn is_empty(&self) -> bool {
983        self.id.is_empty() && self.desc.is_none() && self.seq.is_empty()
984    }
985
986    /// Check validity of Fasta record.
987    pub fn check(&self) -> Result<(), &str> {
988        if self.id().is_empty() {
989            return Err("Expecting id for Fasta record.");
990        }
991        if !self.seq.is_ascii() {
992            return Err("Non-ascii character found in sequence.");
993        }
994
995        Ok(())
996    }
997
998    /// Return the id of the record.
999    pub fn id(&self) -> &str {
1000        self.id.as_ref()
1001    }
1002
1003    /// Return descriptions if present.
1004    pub fn desc(&self) -> Option<&str> {
1005        match self.desc.as_ref() {
1006            Some(desc) => Some(desc),
1007            None => None,
1008        }
1009    }
1010
1011    /// Return the sequence of the record.
1012    pub fn seq(&self) -> TextSlice<'_> {
1013        self.seq.as_bytes()
1014    }
1015
1016    /// Clear the record.
1017    fn clear(&mut self) {
1018        self.id.clear();
1019        self.desc = None;
1020        self.seq.clear();
1021    }
1022}
1023
1024impl fmt::Display for Record {
1025    /// Allows for using `Record` in a given formatter `f`. In general this is for
1026    /// creating a `String` representation of a `Record` and, optionally, writing it to
1027    /// a file.
1028    ///
1029    /// # Errors
1030    /// Returns [`std::fmt::Error`](https://doc.rust-lang.org/std/fmt/struct.Error.html)
1031    /// if there is an issue formatting to the stream.
1032    ///
1033    /// # Examples
1034    ///
1035    /// Read in a Fasta `Record` and create a `String` representation of it.
1036    ///
1037    /// ```rust
1038    /// use bio::io::fasta::Reader;
1039    /// use std::fmt::Write;
1040    /// // create a "fake" fasta file
1041    /// let fasta: &'static [u8] = b">id comment1 comment2\nACGT\n";
1042    /// let mut records = Reader::new(fasta).records().map(|r| r.unwrap());
1043    /// let record = records.next().unwrap();
1044    ///
1045    /// let mut actual = String::new();
1046    /// // populate `actual` with a string representation of our record
1047    /// write!(actual, "{}", record).unwrap();
1048    ///
1049    /// let expected = std::str::from_utf8(fasta).unwrap();
1050    ///
1051    /// assert_eq!(actual, expected)
1052    /// ```
1053    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
1054        let header = match self.desc() {
1055            Some(d) => format!("{} {}", self.id().to_owned(), d),
1056            None => self.id().to_owned(),
1057        };
1058        write!(
1059            f,
1060            ">{}\n{}\n",
1061            header,
1062            std::str::from_utf8(self.seq()).unwrap(),
1063        )
1064    }
1065}
1066
/// An iterator over the records of a Fasta file.
///
/// Each item is an `io::Result<Record>`. After the first error has been
/// yielded the iterator only returns `None`.
#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
pub struct Records<B>
where
    B: io::BufRead,
{
    /// Reader the records are pulled from.
    reader: Reader<B>,
    /// Set when reading a record failed; checked at the start of every `next` call.
    error_has_occured: bool,
}
1076
1077impl<B> Iterator for Records<B>
1078where
1079    B: io::BufRead,
1080{
1081    type Item = io::Result<Record>;
1082
1083    fn next(&mut self) -> Option<io::Result<Record>> {
1084        if self.error_has_occured {
1085            None
1086        } else {
1087            let mut record = Record::new();
1088            match self.reader.read(&mut record) {
1089                Ok(()) if record.is_empty() => None,
1090                Ok(()) => Some(Ok(record)),
1091                Err(err) => {
1092                    self.error_has_occured = true;
1093                    Some(Err(err))
1094                }
1095            }
1096        }
1097    }
1098}
1099
1100#[cfg(test)]
1101mod tests {
1102    use super::*;
1103    use std::fmt::Write as FmtWrite;
1104    use std::io;
1105
    // Two-record FASTA fixture wrapped at 12 bases per line: "id" (with a
    // description, 52 bases) and "id2" (no description, 40 bases).
    const FASTA_FILE: &[u8] = b">id desc
ACCGTAGGCTGA
CCGTAGGCTGAA
CGTAGGCTGAAA
GTAGGCTGAAAA
CCCC
>id2
ATTGTTGTTTTA
ATTGTTGTTTTA
ATTGTTGTTTTA
GGGG
";
    // Index for FASTA_FILE. Tab-separated columns per line:
    // name, len, offset, line_bases, line_bytes.
    const FAI_FILE: &[u8] = b"id\t52\t9\t12\t13
id2\t40\t71\t12\t13
";

    // Only the header and the first sequence line of FASTA_FILE, without a
    // trailing newline; used together with FAI_FILE to provoke reads past EOF.
    const TRUNCATED_FASTA: &[u8] = b">id desc\nACCGTAGGCTGA";

    // Same records as FASTA_FILE, but every line ends in CRLF.
    const FASTA_FILE_CRLF: &[u8] = b">id desc\r
ACCGTAGGCTGA\r
CCGTAGGCTGAA\r
CGTAGGCTGAAA\r
GTAGGCTGAAAA\r
CCCC\r
>id2\r
ATTGTTGTTTTA\r
ATTGTTGTTTTA\r
ATTGTTGTTTTA\r
GGGG\r
";
    // Index for FASTA_FILE_CRLF: offsets and line_bytes account for the extra
    // `\r` per line, and the index itself uses CRLF line endings as well.
    const FAI_FILE_CRLF: &[u8] = b"id\t52\t10\t12\t14\r
id2\t40\t78\t12\t14\r
";

    // Single record whose last sequence line is not terminated by a newline.
    const FASTA_FILE_NO_TRAILING_LF: &[u8] = b">id desc
GTAGGCTGAAAA
CCCC";
    // Index for FASTA_FILE_NO_TRAILING_LF (the index has no trailing newline either).
    const FAI_FILE_NO_TRAILING_LF: &[u8] = b"id\t16\t9\t12\t13";

    // Expected writer output without line wrapping.
    const WRITE_FASTA_FILE: &[u8] = b">id desc
ACCGTAGGCTGA
>id2
ATTGTTGTTTTA
";
    // Expected writer output for the same records with a line wrap of 4.
    const WRITE_FASTA_FILE_WIDTH: &[u8] = b">id desc
ACCG
TAGG
CTGA
>id2
ATTG
TTGT
TTTA
";
1159
    // Test double implementing `Read` and `Seek` whose operations can be
    // switched to fail independently of each other.
    struct ReaderMock {
        // When set, `seek` returns an error for `SeekFrom::Start` requests.
        seek_fails: bool,
        // When set, `read` returns an error.
        read_fails: bool,
    }
1164
1165    impl Read for ReaderMock {
1166        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
1167            if self.read_fails {
1168                Err(io::Error::new(io::ErrorKind::Other, "Read set to fail"))
1169            } else {
1170                Ok(buf.len())
1171            }
1172        }
1173    }
1174
1175    impl Seek for ReaderMock {
1176        fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
1177            if let io::SeekFrom::Start(pos) = pos {
1178                if self.seek_fails {
1179                    Err(io::Error::new(io::ErrorKind::Other, "Seek set to fail"))
1180                } else {
1181                    Ok(pos)
1182                }
1183            } else {
1184                unimplemented!();
1185            }
1186        }
1187    }
1188
1189    #[test]
1190    fn test_reader() {
1191        let reader = Reader::new(FASTA_FILE);
1192        let ids = ["id", "id2"];
1193        let descs = [Some("desc"), None];
1194        let seqs: [&[u8]; 2] = [
1195            b"ACCGTAGGCTGACCGTAGGCTGAACGTAGGCTGAAAGTAGGCTGAAAACCCC",
1196            b"ATTGTTGTTTTAATTGTTGTTTTAATTGTTGTTTTAGGGG",
1197        ];
1198
1199        for (i, r) in reader.records().enumerate() {
1200            let record = r.expect("Error reading record");
1201            assert_eq!(record.check(), Ok(()));
1202            assert_eq!(record.id(), ids[i]);
1203            assert_eq!(record.desc(), descs[i]);
1204            assert_eq!(record.seq(), seqs[i]);
1205        }
1206
1207        let reader = Reader::with_capacity(100, FASTA_FILE);
1208
1209        for (i, r) in reader.records().enumerate() {
1210            let record = r.expect("Error reading record");
1211            assert_eq!(record.check(), Ok(()));
1212            assert_eq!(record.id(), ids[i]);
1213            assert_eq!(record.desc(), descs[i]);
1214            assert_eq!(record.seq(), seqs[i]);
1215        }
1216
1217        let reader = Reader::from_bufread(io::BufReader::new(FASTA_FILE));
1218
1219        for (i, r) in reader.records().enumerate() {
1220            let record = r.expect("Error reading record");
1221            assert_eq!(record.check(), Ok(()));
1222            assert_eq!(record.id(), ids[i]);
1223            assert_eq!(record.desc(), descs[i]);
1224            assert_eq!(record.seq(), seqs[i]);
1225        }
1226    }
1227
1228    #[test]
1229    fn test_faread_trait() {
1230        let path = "genome.fa.gz";
1231        let mut fa_reader: Box<dyn FastaRead> = match path.ends_with(".gz") {
1232            true => Box::new(Reader::new(io::BufReader::new(FASTA_FILE))),
1233            false => Box::new(Reader::new(FASTA_FILE)),
1234        };
1235        // The read method can be called, since it is implemented by
1236        // FQRead. Right now, the records method would not work.
1237        let mut record = Record::new();
1238        fa_reader.read(&mut record).unwrap();
1239        // Check if the returned result is correct.
1240        assert_eq!(record.check(), Ok(()));
1241        assert_eq!(record.id(), "id");
1242        assert_eq!(record.desc(), Some("desc"));
1243        assert_eq!(
1244            record.seq().to_vec(),
1245            b"ACCGTAGGCTGACCGTAGGCTGAACGTAGGCTGAAAGTAGGCTGAAAACCCC".to_vec()
1246        );
1247    }
1248
1249    #[test]
1250    fn test_reader_wrong_header() {
1251        let mut reader = Reader::new(&b"!test\nACGTA\n"[..]);
1252        let mut record = Record::new();
1253        assert!(
1254            reader.read(&mut record).is_err(),
1255            "read() should return Err if FASTA header is malformed"
1256        );
1257    }
1258
1259    #[test]
1260    fn test_reader_no_id() {
1261        let mut reader = Reader::new(&b">\nACGTA\n"[..]);
1262        let mut record = Record::new();
1263        reader.read(&mut record).unwrap();
1264        assert!(
1265            record.check().is_err(),
1266            "check() should return Err if FASTA header is empty"
1267        );
1268    }
1269
1270    #[test]
1271    fn test_reader_non_ascii_sequence() {
1272        let mut reader = Reader::new(&b">id\nACGTA\xE2\x98\xB9AT\n"[..]);
1273        let mut record = Record::new();
1274        reader.read(&mut record).unwrap();
1275        assert!(
1276            record.check().is_err(),
1277            "check() should return Err if FASTA sequence is not ASCII"
1278        );
1279    }
1280
1281    #[test]
1282    fn test_reader_read_fails() {
1283        let mut reader = Reader::new(ReaderMock {
1284            seek_fails: false,
1285            read_fails: true,
1286        });
1287        let mut record = Record::new();
1288        assert!(
1289            reader.read(&mut record).is_err(),
1290            "read() should return Err if Read::read fails"
1291        );
1292    }
1293
1294    #[test]
1295    fn test_reader_read_fails_iter() {
1296        let reader = Reader::new(ReaderMock {
1297            seek_fails: false,
1298            read_fails: true,
1299        });
1300        let mut records = reader.records();
1301
1302        assert!(
1303            records.next().unwrap().is_err(),
1304            "next() should return Err if Read::read fails"
1305        );
1306        assert!(
1307            records.next().is_none(),
1308            "next() should return None after error has occurred"
1309        );
1310    }
1311
1312    #[test]
1313    fn test_reader_from_file_path_doesnt_exist_returns_err() {
1314        let path = Path::new("/I/dont/exist.fasta");
1315        let error = Reader::from_file(path)
1316            .unwrap_err()
1317            .downcast::<String>()
1318            .unwrap();
1319
1320        assert_eq!(&error, "Failed to read fasta from \"/I/dont/exist.fasta\"")
1321    }
1322
1323    #[test]
1324    fn test_record_with_attrs_without_description() {
1325        let record = Record::with_attrs("id_str", None, b"ATGCGGG");
1326        assert_eq!(record.id(), "id_str");
1327        assert_eq!(record.desc(), None);
1328        assert_eq!(record.seq(), b"ATGCGGG");
1329    }
1330
1331    #[test]
1332    fn test_record_with_attrs_with_description() {
1333        let record = Record::with_attrs("id_str", Some("desc"), b"ATGCGGG");
1334        assert_eq!(record.id(), "id_str");
1335        assert_eq!(record.desc(), Some("desc"));
1336        assert_eq!(record.seq(), b"ATGCGGG");
1337    }
1338
1339    #[test]
1340    fn test_index_sequences() {
1341        let reader = IndexedReader::new(io::Cursor::new(FASTA_FILE), FAI_FILE).unwrap();
1342
1343        let sequences = reader.index.sequences();
1344        assert_eq!(sequences.len(), 2);
1345        assert_eq!(
1346            sequences[0],
1347            Sequence {
1348                name: "id".into(),
1349                len: 52,
1350            }
1351        );
1352        assert_eq!(
1353            sequences[1],
1354            Sequence {
1355                name: "id2".into(),
1356                len: 40,
1357            }
1358        );
1359    }
1360
    // Run the shared indexed-reader checks (regular, truncated and
    // whitespace-heavy input) with the buffer-filling `_read_buffer` helper.
    #[test]
    fn test_indexed_reader() {
        _test_indexed_reader(FASTA_FILE, FAI_FILE, _read_buffer);
        _test_indexed_reader_truncated(_read_buffer);
        _test_indexed_reader_extreme_whitespace(_read_buffer);
    }
1367
    // Same substring checks as `test_indexed_reader`, on the CRLF fixture.
    #[test]
    fn test_indexed_reader_crlf() {
        _test_indexed_reader(FASTA_FILE_CRLF, FAI_FILE_CRLF, _read_buffer);
    }
1372
    // Run the shared indexed-reader checks (regular, truncated and
    // whitespace-heavy input) with the iterator-based `_read_iter` helper.
    #[test]
    fn test_indexed_reader_iter() {
        _test_indexed_reader(FASTA_FILE, FAI_FILE, _read_iter);
        _test_indexed_reader_truncated(_read_iter);
        _test_indexed_reader_extreme_whitespace(_read_iter);
    }
1379
    // Same substring checks as `test_indexed_reader_iter`, on the CRLF fixture.
    #[test]
    fn test_indexed_reader_iter_crlf() {
        _test_indexed_reader(FASTA_FILE_CRLF, FAI_FILE_CRLF, _read_iter);
    }
1384
1385    fn _test_indexed_reader<'a, F>(fasta: &'a [u8], fai: &'a [u8], read: F)
1386    where
1387        F: Fn(&mut IndexedReader<io::Cursor<&'a [u8]>>, &str, u64, u64) -> io::Result<Vec<u8>>,
1388    {
1389        let mut reader = IndexedReader::new(io::Cursor::new(fasta), fai).unwrap();
1390
1391        // Test reading various substrings of the sequence
1392        assert_eq!(read(&mut reader, "id", 1, 5).unwrap(), b"CCGT");
1393        assert_eq!(
1394            read(&mut reader, "id", 1, 31).unwrap(),
1395            b"CCGTAGGCTGACCGTAGGCTGAACGTAGGC"
1396        );
1397        assert_eq!(read(&mut reader, "id", 13, 23).unwrap(), b"CGTAGGCTGA");
1398        assert_eq!(
1399            read(&mut reader, "id", 36, 52).unwrap(),
1400            b"GTAGGCTGAAAACCCC"
1401        );
1402        assert_eq!(
1403            read(&mut reader, "id2", 12, 40).unwrap(),
1404            b"ATTGTTGTTTTAATTGTTGTTTTAGGGG"
1405        );
1406        assert_eq!(read(&mut reader, "id2", 12, 12).unwrap(), b"");
1407        assert_eq!(read(&mut reader, "id2", 12, 13).unwrap(), b"A");
1408        // Minimal sequence spanning new-line
1409        assert_eq!(read(&mut reader, "id", 11, 13).unwrap(), b"AC");
1410
1411        assert!(read(&mut reader, "id2", 12, 11).is_err());
1412        assert!(read(&mut reader, "id2", 12, 1000).is_err());
1413        assert!(read(&mut reader, "id3", 0, 1).is_err());
1414    }
1415
1416    fn _test_indexed_reader_truncated<'a, F>(read: F)
1417    where
1418        F: Fn(&mut IndexedReader<io::Cursor<&'a [u8]>>, &str, u64, u64) -> io::Result<Vec<u8>>,
1419    {
1420        let mut reader = IndexedReader::new(io::Cursor::new(TRUNCATED_FASTA), FAI_FILE).unwrap();
1421
1422        assert_eq!(read(&mut reader, "id", 0, 12).unwrap(), b"ACCGTAGGCTGA");
1423        assert!(read(&mut reader, "id", 0, 13).is_err()); // read past EOF
1424        assert!(read(&mut reader, "id", 36, 52).is_err()); // seek and read past EOF
1425        assert!(read(&mut reader, "id2", 12, 40).is_err()); // seek and read past EOF
1426    }
1427
1428    fn _test_indexed_reader_extreme_whitespace<F>(read: F)
1429    where
1430        F: Fn(&mut IndexedReader<io::Cursor<Vec<u8>>>, &str, u64, u64) -> io::Result<Vec<u8>>,
1431    {
1432        // Test to exercise the case where we cannot consume all whitespace at once. More than
1433        // DEFAULT_BUF_SIZE (a non-public constant set to 8 * 1024) whitespace is used to ensure
1434        // that it can't all fit in the BufReader at once.
1435        let mut seq = Vec::new();
1436        seq.push(b'A');
1437        seq.resize(10000, b' ');
1438        seq.push(b'B');
1439
1440        let fasta = io::Cursor::new(seq);
1441        let fai = io::Cursor::new(Vec::from(&b"id\t2\t0\t1\t10000"[..]));
1442        let mut reader = IndexedReader::new(fasta, fai).unwrap();
1443
1444        assert_eq!(read(&mut reader, "id", 0, 2).unwrap(), b"AB");
1445    }
1446
1447    fn _read_buffer<T>(
1448        reader: &mut IndexedReader<T>,
1449        seqname: &str,
1450        start: u64,
1451        stop: u64,
1452    ) -> io::Result<Vec<u8>>
1453    where
1454        T: Seek + Read,
1455    {
1456        let mut seq = vec![];
1457        reader.fetch(seqname, start, stop)?;
1458        reader.read(&mut seq)?;
1459
1460        Ok(seq)
1461    }
1462
1463    fn _read_iter<T>(
1464        reader: &mut IndexedReader<T>,
1465        seqname: &str,
1466        start: u64,
1467        stop: u64,
1468    ) -> io::Result<Vec<u8>>
1469    where
1470        T: Seek + Read,
1471    {
1472        let mut seq = vec![];
1473        reader.fetch(seqname, start, stop)?;
1474        for nuc in reader.read_iter()? {
1475            seq.push(nuc?);
1476        }
1477
1478        Ok(seq)
1479    }
1480
    // Whole-sequence fetch, read into a buffer, LF fixture.
    #[test]
    fn test_indexed_reader_all() {
        _test_indexed_reader_all(FASTA_FILE, FAI_FILE, _read_buffer_all);
    }
1485
    // Whole-sequence fetch, read into a buffer, CRLF fixture.
    #[test]
    fn test_indexed_reader_crlf_all() {
        _test_indexed_reader_all(FASTA_FILE_CRLF, FAI_FILE_CRLF, _read_buffer_all);
    }
1490
    // Whole-sequence fetch, read through the iterator, LF fixture.
    #[test]
    fn test_indexed_reader_iter_all() {
        _test_indexed_reader_all(FASTA_FILE, FAI_FILE, _read_iter_all);
    }
1495
    // Whole-sequence fetch, read through the iterator, CRLF fixture.
    #[test]
    fn test_indexed_reader_iter_crlf_all() {
        _test_indexed_reader_all(FASTA_FILE_CRLF, FAI_FILE_CRLF, _read_iter_all);
    }
1500
1501    fn _test_indexed_reader_all<'a, F>(fasta: &'a [u8], fai: &'a [u8], read: F)
1502    where
1503        F: Fn(&mut IndexedReader<io::Cursor<&'a [u8]>>, &str) -> io::Result<Vec<u8>>,
1504    {
1505        let mut reader = IndexedReader::new(io::Cursor::new(fasta), fai).unwrap();
1506
1507        assert_eq!(
1508            read(&mut reader, "id").unwrap(),
1509            &b"ACCGTAGGCTGACCGTAGGCTGAACGTAGGCTGAAAGTAGGCTGAAAACCCC"[..]
1510        );
1511        assert_eq!(
1512            read(&mut reader, "id2").unwrap(),
1513            &b"ATTGTTGTTTTAATTGTTGTTTTAATTGTTGTTTTAGGGG"[..]
1514        );
1515    }
1516
1517    fn _read_buffer_all<T>(reader: &mut IndexedReader<T>, seqname: &str) -> io::Result<Vec<u8>>
1518    where
1519        T: Seek + Read,
1520    {
1521        let mut seq = vec![];
1522        reader.fetch_all(seqname)?;
1523        reader.read(&mut seq)?;
1524
1525        Ok(seq)
1526    }
1527
1528    fn _read_iter_all<T>(reader: &mut IndexedReader<T>, seqname: &str) -> io::Result<Vec<u8>>
1529    where
1530        T: Seek + Read,
1531    {
1532        let mut seq = vec![];
1533        reader.fetch_all(seqname)?;
1534        for nuc in reader.read_iter()? {
1535            seq.push(nuc?);
1536        }
1537
1538        Ok(seq)
1539    }
1540
    // Whole-sequence fetch by record index, read into a buffer, LF fixture.
    #[test]
    fn test_indexed_reader_by_rid_all() {
        _test_indexed_reader_by_rid_all(FASTA_FILE, FAI_FILE, _read_buffer_by_rid_all);
    }
1545
    // Whole-sequence fetch by record index, read into a buffer, CRLF fixture.
    #[test]
    fn test_indexed_reader_crlf_by_rid_all() {
        _test_indexed_reader_by_rid_all(FASTA_FILE_CRLF, FAI_FILE_CRLF, _read_buffer_by_rid_all);
    }
1550
    // Whole-sequence fetch by record index, read through the iterator, LF fixture.
    #[test]
    fn test_indexed_reader_iter_by_rid_all() {
        _test_indexed_reader_by_rid_all(FASTA_FILE, FAI_FILE, _read_iter_by_rid_all);
    }
1555
    // Whole-sequence fetch by record index, read through the iterator, CRLF fixture.
    #[test]
    fn test_indexed_reader_iter_crlf_by_rid_all() {
        _test_indexed_reader_by_rid_all(FASTA_FILE_CRLF, FAI_FILE_CRLF, _read_iter_by_rid_all);
    }
1560
1561    fn _test_indexed_reader_by_rid_all<'a, F>(fasta: &'a [u8], fai: &'a [u8], read: F)
1562    where
1563        F: Fn(&mut IndexedReader<io::Cursor<&'a [u8]>>, usize) -> io::Result<Vec<u8>>,
1564    {
1565        let mut reader = IndexedReader::new(io::Cursor::new(fasta), fai).unwrap();
1566
1567        assert_eq!(
1568            read(&mut reader, 0).unwrap(),
1569            &b"ACCGTAGGCTGACCGTAGGCTGAACGTAGGCTGAAAGTAGGCTGAAAACCCC"[..]
1570        );
1571        assert_eq!(
1572            read(&mut reader, 1).unwrap(),
1573            &b"ATTGTTGTTTTAATTGTTGTTTTAATTGTTGTTTTAGGGG"[..]
1574        );
1575    }
1576
1577    fn _read_buffer_by_rid_all<T>(
1578        reader: &mut IndexedReader<T>,
1579        seq_index: usize,
1580    ) -> io::Result<Vec<u8>>
1581    where
1582        T: Seek + Read,
1583    {
1584        let mut seq = vec![];
1585        reader.fetch_all_by_rid(seq_index)?;
1586        reader.read(&mut seq)?;
1587
1588        Ok(seq)
1589    }
1590
1591    fn _read_iter_by_rid_all<T>(
1592        reader: &mut IndexedReader<T>,
1593        seq_index: usize,
1594    ) -> io::Result<Vec<u8>>
1595    where
1596        T: Seek + Read,
1597    {
1598        let mut seq = vec![];
1599        reader.fetch_all_by_rid(seq_index)?;
1600        for nuc in reader.read_iter()? {
1601            seq.push(nuc?);
1602        }
1603
1604        Ok(seq)
1605    }
1606
1607    #[test]
1608    fn test_indexed_reader_iter_size_hint() {
1609        let mut reader = IndexedReader::new(io::Cursor::new(FASTA_FILE), FAI_FILE).unwrap();
1610        reader.fetch("id", 2, 4).unwrap();
1611        let mut iterator = reader.read_iter().unwrap();
1612
1613        assert_eq!(iterator.size_hint(), (2, Some(2)));
1614        assert_eq!(iterator.next().unwrap().unwrap(), b'C');
1615        assert_eq!(iterator.size_hint(), (1, Some(1)));
1616        assert_eq!(iterator.next().unwrap().unwrap(), b'G');
1617        assert_eq!(iterator.size_hint(), (0, Some(0)));
1618        assert!(iterator.next().is_none());
1619        assert_eq!(iterator.size_hint(), (0, Some(0)));
1620    }
1621
1622    #[test]
1623    fn test_indexed_reader_reused_buffer() {
1624        let mut reader = IndexedReader::new(io::Cursor::new(FASTA_FILE), FAI_FILE).unwrap();
1625        let mut seq = Vec::new();
1626
1627        reader.fetch("id", 1, 5).unwrap();
1628        reader.read(&mut seq).unwrap();
1629        assert_eq!(seq, b"CCGT");
1630
1631        reader.fetch("id", 13, 23).unwrap();
1632        reader.read(&mut seq).unwrap();
1633        assert_eq!(seq, b"CGTAGGCTGA");
1634    }
1635
1636    #[test]
1637    fn test_indexed_reader_no_trailing_lf() {
1638        let mut reader = IndexedReader::new(
1639            io::Cursor::new(FASTA_FILE_NO_TRAILING_LF),
1640            FAI_FILE_NO_TRAILING_LF,
1641        )
1642        .unwrap();
1643        let mut seq = Vec::new();
1644
1645        reader.fetch("id", 0, 16).unwrap();
1646        reader.read(&mut seq).unwrap();
1647        assert_eq!(seq, b"GTAGGCTGAAAACCCC");
1648    }
1649
1650    #[test]
1651    fn test_indexed_reader_bad_reader() {
1652        let bad_reader = ReaderMock {
1653            seek_fails: false,
1654            read_fails: false,
1655        };
1656        let mut reader = IndexedReader::new(bad_reader, FAI_FILE).unwrap();
1657        let mut seq = Vec::new();
1658        reader.fetch("id", 0, 10).unwrap();
1659        assert!(reader.read(&mut seq).is_ok())
1660    }
1661
1662    #[test]
1663    fn test_indexed_reader_read_seek_fails() {
1664        let bad_reader = ReaderMock {
1665            seek_fails: true,
1666            read_fails: false,
1667        };
1668        let mut reader = IndexedReader::new(bad_reader, FAI_FILE).unwrap();
1669        let mut seq = Vec::new();
1670        reader.fetch("id", 0, 10).unwrap();
1671        assert!(reader.read(&mut seq).is_err());
1672    }
1673
1674    #[test]
1675    fn test_indexed_reader_read_read_fails() {
1676        let bad_reader = ReaderMock {
1677            seek_fails: false,
1678            read_fails: true,
1679        };
1680        let mut reader = IndexedReader::new(bad_reader, FAI_FILE).unwrap();
1681        let mut seq = Vec::new();
1682        reader.fetch("id", 0, 10).unwrap();
1683        assert!(reader.read(&mut seq).is_err());
1684    }
1685
1686    #[test]
1687    fn test_indexed_reader_iter_seek_fails() {
1688        let bad_reader = ReaderMock {
1689            seek_fails: true,
1690            read_fails: false,
1691        };
1692        let mut reader = IndexedReader::new(bad_reader, FAI_FILE).unwrap();
1693        reader.fetch("id", 0, 10).unwrap();
1694        assert!(reader.read_iter().is_err());
1695    }
1696
1697    #[test]
1698    fn test_indexed_reader_iter_read_fails() {
1699        let bad_reader = ReaderMock {
1700            seek_fails: false,
1701            read_fails: true,
1702        };
1703        let mut reader = IndexedReader::new(bad_reader, FAI_FILE).unwrap();
1704        reader.fetch("id", 0, 10).unwrap();
1705        let mut iterator = reader.read_iter().unwrap();
1706        assert!(iterator.next().unwrap().is_err());
1707        assert!(
1708            iterator.next().is_none(),
1709            "next() should return none after error has occurred"
1710        );
1711    }
1712
1713    #[test]
1714    fn test_indexed_reader_no_fetch_read_fails() {
1715        let reader = ReaderMock {
1716            seek_fails: false,
1717            read_fails: false,
1718        };
1719        let mut reader = IndexedReader::new(reader, FAI_FILE).unwrap();
1720        let mut seq = vec![];
1721        assert!(reader.read(&mut seq).is_err());
1722    }
1723
1724    #[test]
1725    fn test_indexed_reader_no_fetch_read_iter_fails() {
1726        let reader = ReaderMock {
1727            seek_fails: false,
1728            read_fails: false,
1729        };
1730        let mut reader = IndexedReader::new(reader, FAI_FILE).unwrap();
1731        assert!(reader.read_iter().is_err());
1732    }
1733
1734    #[test]
1735    fn test_writer() {
1736        let mut writer = Writer::new(Vec::new());
1737        writer.write("id", Some("desc"), b"ACCGTAGGCTGA").unwrap();
1738        writer.write("id2", None, b"ATTGTTGTTTTA").unwrap();
1739        writer.flush().unwrap();
1740        assert_eq!(writer.writer.get_ref(), &WRITE_FASTA_FILE);
1741
1742        let mut writer = Writer::with_capacity(100, Vec::new());
1743        writer.write("id", Some("desc"), b"ACCGTAGGCTGA").unwrap();
1744        writer.write("id2", None, b"ATTGTTGTTTTA").unwrap();
1745        writer.flush().unwrap();
1746        assert_eq!(writer.writer.get_ref(), &WRITE_FASTA_FILE);
1747
1748        let mut writer = Writer::from_bufwriter(std::io::BufWriter::with_capacity(100, Vec::new()));
1749        writer.write("id", Some("desc"), b"ACCGTAGGCTGA").unwrap();
1750        writer.write("id2", None, b"ATTGTTGTTTTA").unwrap();
1751        writer.flush().unwrap();
1752        assert_eq!(writer.writer.get_ref(), &WRITE_FASTA_FILE);
1753    }
1754
1755    #[test]
1756    fn test_display_record_no_desc_id_without_space_after() {
1757        let fasta: &'static [u8] = b">id\nACGT\n";
1758        let mut records = Reader::new(fasta).records().map(|r| r.unwrap());
1759        let record = records.next().unwrap();
1760        let mut actual = String::new();
1761        write!(actual, "{}", record).unwrap();
1762
1763        let expected = std::str::from_utf8(fasta).unwrap();
1764
1765        assert_eq!(actual, expected)
1766    }
1767
1768    #[test]
1769    fn test_display_record_with_desc_id_has_space_between_id_and_desc() {
1770        let fasta: &'static [u8] = b">id comment1 comment2\nACGT\n";
1771        let mut records = Reader::new(fasta).records().map(|r| r.unwrap());
1772        let record = records.next().unwrap();
1773        let mut actual = String::new();
1774        write!(actual, "{}", record).unwrap();
1775
1776        let expected = std::str::from_utf8(fasta).unwrap();
1777
1778        assert_eq!(actual, expected)
1779    }
1780
1781    #[test]
1782    fn test_index_record_idx_by_rid_invalid_index_returns_error() {
1783        let reader = ReaderMock {
1784            seek_fails: false,
1785            read_fails: false,
1786        };
1787        let index_reader = IndexedReader::new(reader, FAI_FILE).unwrap();
1788
1789        let actual = index_reader.idx_by_rid(99999).unwrap_err();
1790        let expected = io::Error::new(io::ErrorKind::Other, "Invalid record index in fasta file.");
1791
1792        assert_eq!(actual.kind(), expected.kind());
1793        assert_eq!(actual.to_string(), expected.to_string())
1794    }
1795
1796    #[test]
1797    fn test_index_record_fetch_by_rid_second_index_returns_second_record() {
1798        let reader = ReaderMock {
1799            seek_fails: false,
1800            read_fails: false,
1801        };
1802        let mut index_reader = IndexedReader::new(reader, FAI_FILE).unwrap();
1803
1804        let actual = index_reader.fetch_by_rid(1, 1, 3);
1805
1806        assert!(actual.is_ok());
1807        assert_eq!(
1808            index_reader.fetched_idx,
1809            Some(IndexRecord {
1810                name: String::from("id2"),
1811                len: 40,
1812                offset: 71,
1813                line_bases: 12,
1814                line_bytes: 13
1815            })
1816        )
1817    }
1818
1819    #[test]
1820    fn test_writer_to_file_dir_doesnt_exist_returns_err() {
1821        let path = Path::new("/I/dont/exist.fa");
1822
1823        let actual = Writer::to_file(path).unwrap_err();
1824        let expected = io::Error::new(io::ErrorKind::NotFound, "foo");
1825
1826        assert_eq!(actual.kind(), expected.kind());
1827    }
1828
1829    #[test]
1830    fn test_writer_to_file_dir_exists_returns_ok() {
1831        let file = tempfile::NamedTempFile::new().expect("Could not create temp file");
1832        let path = file.path();
1833
1834        assert!(Writer::to_file(path).is_ok());
1835        assert!(Writer::to_file_with_capacity(100, path).is_ok());
1836    }
1837
1838    #[test]
1839    fn test_write_record() {
1840        let path = Path::new("test.fa");
1841        let file = fs::File::create(path).unwrap();
1842        {
1843            let handle = io::BufWriter::new(file);
1844            let mut writer = Writer {
1845                writer: handle,
1846                linewrap: Some(4),
1847            };
1848            let record = Record::with_attrs("id", Some("desc"), b"ACGT");
1849
1850            let write_result = writer.write_record(&record);
1851            assert!(write_result.is_ok());
1852        }
1853
1854        let actual = fs::read_to_string(path).unwrap();
1855        let expected = ">id desc\nACGT\n";
1856
1857        assert!(fs::remove_file(path).is_ok());
1858        assert_eq!(actual, expected)
1859    }
1860
1861    #[test]
1862    fn test_write_with_linewrap() {
1863        let width = 4;
1864        let mut writer = Writer::new(Vec::new());
1865        writer.set_linewrap(Some(width));
1866        writer.write("id", Some("desc"), b"ACCGTAGGCTGA").unwrap();
1867        writer.write("id2", None, b"ATTGTTGTTTTA").unwrap();
1868        writer.flush().unwrap();
1869        assert_eq!(writer.writer.get_ref(), &WRITE_FASTA_FILE_WIDTH);
1870
1871        let mut writer = Writer::with_capacity(100, Vec::new());
1872        writer.set_linewrap(Some(width));
1873        writer.write("id", Some("desc"), b"ACCGTAGGCTGA").unwrap();
1874        writer.write("id2", None, b"ATTGTTGTTTTA").unwrap();
1875        writer.flush().unwrap();
1876        assert_eq!(writer.writer.get_ref(), &WRITE_FASTA_FILE_WIDTH);
1877
1878        let mut writer = Writer::from_bufwriter(std::io::BufWriter::with_capacity(100, Vec::new()));
1879        writer.set_linewrap(Some(width));
1880        writer.write("id", Some("desc"), b"ACCGTAGGCTGA").unwrap();
1881        writer.write("id2", None, b"ATTGTTGTTTTA").unwrap();
1882        writer.flush().unwrap();
1883        assert_eq!(writer.writer.get_ref(), &WRITE_FASTA_FILE_WIDTH);
1884    }
1885}