seq_io/lib.rs
1//! This library provides an(other) attempt at high performance FASTA and FASTQ parsing and writing.
2//! The FASTA parser can read and write multi-line files. The FASTQ parser supports only single
3//! lines.
4//!
5//! By default, the parsers avoid allocations and copying as much as possible.
6//! [`fasta::RefRecord`](fasta/struct.RefRecord.html) and
7//! [`fastq::RefRecord`](fastq/struct.RefRecord.html) borrow from the underlying buffered
8//! reader. In addition, `fasta::RefRecord` offers the
9//! [`seq_lines()`](fasta/struct.RefRecord.html#method.seq_lines) method,
10//! which allows iterating over individual sequence lines in a multi-line FASTA file
11//! without the need to copy the data.
12//!
13//! By default, both parsers use a buffer of 64 KiB size. If a record with a longer
14//! sequence is encountered, the buffer will automatically grow. How it grows can be
15//! configured. See [below](#large-sequences) for more information.
16//!
17//! # More detailed documentation
18//!
19//! Please refer to the module docs for more information on how to use the reading and writing
20//! functions, as well as information on the exact parsing behaviour:
21//!
22//! * [`fasta module`](fasta) and [`fasta::Reader`](fasta/struct.Reader.html)
23//! * [`fastq module`](fastq) and [`fastq::Reader`](fastq/struct.Reader.html)
24//!
25//! # Example FASTQ parser:
26//!
27//! This code prints the ID string from each FASTQ record.
28//!
29//! ```no_run
30//! use seq_io::fastq::{Reader,Record};
31//!
32//! let mut reader = Reader::from_path("seqs.fastq").unwrap();
33//!
34//! while let Some(record) = reader.next() {
35//! let record = record.expect("Error reading record");
36//! println!("{}", record.id().unwrap());
37//! }
38//! ```
39//!
40//! # Example FASTA parser calculating mean sequence length:
41//!
42//! The FASTA reader works just the same. One challenge with the FASTA
43//! format is that the sequence can be broken into multiple lines.
44//! Therefore, it is not always possible to get a slice to the whole sequence
45//! without copying the data. But it is possible to use `seq_lines()`
46//! for efficiently iterating over each sequence line:
47//!
48//! ```no_run
49//! use seq_io::fasta::{Reader,Record};
50//!
51//! let mut reader = Reader::from_path("seqs.fasta").unwrap();
52//!
53//! let mut n = 0;
54//! let mut sum = 0;
55//! while let Some(record) = reader.next() {
56//! let record = record.expect("Error reading record");
57//! for s in record.seq_lines() {
58//! sum += s.len();
59//! }
60//! n += 1;
61//! }
62//! println!("mean sequence length of {} records: {:.1} bp", n, sum as f32 / n as f32);
63//! ```
//! If the whole sequence is required at once, there is the
//! [`full_seq`](fasta/struct.RefRecord.html#method.full_seq) method,
//! which will only allocate the sequence if there are multiple lines.
68//!
69//! # Large sequences
70//!
71//! Due to the design of the parsers, each sequence record must fit into the underlying
72//! buffer as a whole. There are different ways to deal with large sequences:
//! It is possible to configure the initial buffer size using `Reader::with_capacity()`.
74//! However, the buffer will also automatically double its size if a record doesn't fit.
75//! How it grows can be configured by applying another policy.
76//!
77//! For example, the readers can be configured to return
78//! [`fasta::Error::BufferLimit`](fasta/enum.Error.html#variant.BufferLimit) /
79//! [`fastq::Error::BufferLimit`](fastq/enum.Error.html#variant.BufferLimit)
80//! if buffer size grows too large. This is done using `set_policy()`:
81//!
82//! ```no_run
83//! use seq_io::fasta::Reader;
84//! use seq_io::policy::DoubleUntilLimited;
85//!
//! // The buffer doubles its size until 128 MiB, then grows by steps
//! // of 128 MiB. If it reaches 1 GiB, there will be an error.
//! let policy = DoubleUntilLimited::new(1 << 27, 1 << 30);
89//! let mut reader = Reader::from_path("input.fasta").unwrap()
90//! .set_policy(policy);
91//! // (...)
92//! ```
93//! For information on how to create a custom policy, refer to the
94//! [`policy`](policy) module docs.
95//!
96//! # Owned records
97//! Both readers also provide iterators similar to *Rust-Bio*, which return owned data. This
//! is slower, but makes sense, e.g. if the records are collected into a vector:
99//!
100//! ```no_run
101//! use seq_io::fasta::Reader;
102//!
103//! let mut reader = Reader::from_path("input.fasta").unwrap();
104//!
105//! let records: Result<Vec<_>, _> = reader.records().collect();
106//! ```
107//!
108//! # Parallel processing
//! Functions for parallel processing can be found in the [`parallel`](parallel/index.html) module.
110
111#[macro_use]
112extern crate serde_derive;
113
114use std::error;
115use std::fmt;
116use std::io;
117
/// Propagates errors from functions that return `Option<Result<T, E>>`
/// (the shape of the streaming readers' `next()` methods): evaluates a
/// `Result` expression, yielding the `Ok` value, or early-returns
/// `Some(Err(e))` from the enclosing function after converting the
/// error with `From` (mirroring what the `?` operator does for
/// plain `Result`-returning functions).
macro_rules! try_opt {
    ($expr: expr) => {
        match $expr {
            Ok(item) => item,
            Err(e) => return Some(Err(::std::convert::From::from(e))),
        }
    };
}
126
/// Unwraps an `Option`, yielding the contained value; if `None`,
/// evaluates the given block instead. The block is typically used to
/// diverge (`return`, `break`, `continue`), which is why this cannot
/// be expressed with `Option::unwrap_or_else` (a closure cannot
/// early-return from the enclosing function).
macro_rules! unwrap_or {
    ($expr:expr, $or:block) => {
        match $expr {
            Some(item) => item,
            None => $or,
        }
    };
}
135
136pub mod fasta;
137pub mod fastq;
138pub mod parallel;
139pub mod policy;
140
/// Removes a single trailing `'\r'` from a byte slice, if present.
///
/// Returns the input unchanged when it does not end with `'\r'`.
/// Only ONE carriage return is stripped (`b"a\r\r"` -> `b"a\r"`),
/// matching the original `split_last`-based implementation.
#[inline]
fn trim_cr(line: &[u8]) -> &[u8] {
    // `strip_suffix` is the idiomatic stdlib form of the manual
    // "split off the last byte and compare" pattern.
    line.strip_suffix(b"\r").unwrap_or(line)
}
150
151/// Makes sure the buffer is full after this call (unless EOF reached)
152/// code adapted from `io::Read::read_exact`
153#[inline(never)]
154fn fill_buf<R>(
155 reader: &mut buffer_redux::BufReader<R, buffer_redux::policy::StdPolicy>,
156) -> io::Result<usize>
157where
158 R: io::Read,
159{
160 let initial_size = reader.buffer().len();
161 let mut num_read = 0;
162 while initial_size + num_read < reader.capacity() {
163 match reader.read_into_buf() {
164 Ok(0) => break,
165 Ok(n) => num_read += n,
166 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
167 Err(e) => return Err(e),
168 }
169 }
170 Ok(num_read)
171}