fire_fasta/lib.rs
1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3
4//! # Fire-Fasta
5//! Ultra-fast, lightweight, zero-copy, lazy Multi-FASTA parser.
6//!
7//! The parser is intended for high performance applications where the input is expected to be well-formed.
8//! Therefore, it sacrifices input validation and deprecated features for parsing performance.
9//!
10//! ### Sequence Characters
11//! The parser makes no assumptions about the sequence alphabet:
12//! It is explicitly intended for custom sequences with characters that do not conform to NCBI specifications.
13//! The only characters not allowed in sequences are unix-style newlines (`LF`),
14//! which are ignored, and the greater-than sign (`>`),
15//! which starts a new sequence descriptor in Multi-FASTA files.
//! Note that the parser does not validate whether a sequence description starts at the beginning of a new line.
17//!
18//! The parser expects input data that is compatible with ASCII.
19//! Multibyte UTF-8 codepoints are processed as separate ASCII characters.
20//!
21//! Windows-style newlines (`CRLF`) are not supported.
//! Instead, the parser treats the `LF` as a unix-style newline and preserves the `CR` as a valid sequence character.
23//! Old FASTA comments starting with `;` are also not supported, they are treated as part of the sequence.
24//!
25//! ### Usage and Lazy Parsing
26//! Calling the parser does one pass over the entire input, separating individual fasta sequences from each other.
27//! No further processing is done and no data is copied.
28//! ```rust
29//! # use fire_fasta::parse_fasta_str;
30//! # use std::error::Error;
31//! # fn main() -> Result<(), Box<dyn Error>> {
32//! let seq = ">example\nMSTIL\nAATIL\n\n";
33//! let fasta = parse_fasta_str(&seq)?;
34//! // or parse_fasta(&data) for &[u8] slices
35//!
36//! assert_eq!(fasta.sequences.len(), 1);
37//!
38//! // Iterating over a sequence removes newlines from the iterator on the fly:
39//! assert_eq!(
40//! String::from_utf8(fasta.sequences[0].iter().copied().collect::<Vec<_>>())?,
41//! "MSTILAATIL"
42//! );
43//!
44//! //If you want to iterate over a sequence multiple times, it may be faster to first copy the full sequence into its own buffer:
45//! let copied: Box<[u8]> = fasta.sequences[0].copy_sequential();
46//! assert_eq!(copied.as_ref(), b"MSTILAATIL");
47//! # Ok(())
48//! # }
49//! ```
50//!
51//! Parsing and copying use the [memchr](https://crates.io/crates/memchr) crate,
52//! and thus operations use SIMD instructions when available.
53
54use memchr::memchr;
55use std::error::Error;
56use std::fmt::{Display, Formatter};
57
58/// A Multi FASTA file containing zero, one, or more [`FastaSequences`].
59/// Access the sequences simply through its `sequences` field:
60///
61/// ```rust
62/// # use fire_fasta::parse_fasta;
63/// # use std::error::Error;
64/// # fn main() -> Result<(), Box<dyn Error>> {
65/// let fasta_file = b">Sample1\nACGTCA\n>Sample2\nACGTCC";
66/// let fasta = parse_fasta(fasta_file)?;
67///
68/// assert_eq!(fasta.sequences[0].description, b"Sample1");
69/// assert_eq!(fasta.sequences[1].description, b"Sample2");
70///
71/// assert_eq!(*fasta.sequences[0].iter().nth(2).unwrap(), b'G');
72/// # Ok(())
73/// # }
74/// ```
75///
76/// [`FastaSequences`]: FastaSequence
77#[derive(Clone, Debug)]
78pub struct Fasta<'a> {
79 /// A vector of sequences present in the fasta file.
80 pub sequences: Vec<FastaSequence<'a>>,
81}
82
/// A single FASTA record: a description line plus its raw, unprocessed sequence data.
/// Nothing is decoded up front, so reading the sequence performs the remaining parsing
/// (such as skipping newlines) on demand.
#[derive(Clone, Debug)]
pub struct FastaSequence<'a> {
    /// The sequence description as a byte slice (without the leading '>' character
    /// and without the trailing newline).
    pub description: &'a [u8],
    /// The raw sequence bytes exactly as they appear in the input; may still contain newlines.
    sequence: &'a [u8],
}
92
/// FASTA parsing error returned by the initial parsing step in [`parse_fasta`].
///
/// The type derives `PartialEq` and `Eq`, so errors can be compared directly
/// (for example with `assert_eq!` in tests).
///
/// [`parse_fasta`]: parse_fasta
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ParseError {
    /// Invalid descriptor start character.
    /// The parser expects any FASTA description line to start with '>'.
    /// The invalid character is returned in the error.
    ///
    /// Since the parser doesn't mind excess newlines between sequences,
    /// this error can only occur if the very first character of a FASTA file isn't a `>`.
    /// If further descriptors in a Multi-FASTA file don't start with `>`, they are added to their
    /// preceding sequence as valid sequence characters.
    InvalidDescription {
        /// The one-byte code point of the wrong descriptor character in the file.
        invalid: u8,
    },

    /// A valid descriptor was parsed, but the input ends before any sequence data follows.
    EmptySequence,
}
114
115impl Display for ParseError {
116 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
117 write!(f, "{self:?}")
118 }
119}
120
121impl Error for ParseError {}
122
123impl<'a> FastaSequence<'a> {
124 /// Returns an iterator over the FASTA sequence characters, excluding newlines.
125 /// Note that the parser expects unix-style line breaks, thus, CR-characters are preserved.
126 ///
127 /// Newlines are filtered out on the fly, meaning that multiple calls to `iter` repeatedly
128 /// search and skip them during iteration.
129 #[inline]
130 pub fn iter(&self) -> impl Iterator<Item = &u8> {
131 self.sequence.iter().filter(|&x| *x != b'\n')
132 }
133
134 /// Copy the sequence into a consecutive memory region.
135 /// This method allocates a buffer and copies the sequence into it, skipping newline symbols.
136 /// Note that any other symbols (including whitespace and line feeds) get preserved.
137 /// The capacity of the return value may be larger than the actual sequence.
138 /// It is guaranteed, however, that only one allocation is performed.
139 #[must_use]
140 pub fn copy_sequential(&self) -> Box<[u8]> {
141 let mut buffer = vec![0u8; self.size_hint()];
142 let mut target = 0;
143 let mut pos = 0;
144 loop {
145 let pivot = memchr(b'\n', &self.sequence[pos..]).unwrap_or(self.sequence.len() - pos);
146 buffer[target..target + pivot].copy_from_slice(&self.sequence[pos..pos + pivot]);
147 pos += pivot + 1;
148 target += pivot;
149
150 if pos >= self.sequence.len() {
151 break;
152 }
153 }
154 buffer.truncate(target);
155 buffer.into_boxed_slice()
156 }
157
158 /// Returns the maximum size in bytes this sequence occupies.
159 /// This size is a limit and could be smaller,
160 /// for example if newlines are filtered out of the sequence (see [`copy_sequential`])
161 ///
162 /// [`copy_sequential`]: FastaSequence::copy_sequential
163 pub fn size_hint(&self) -> usize {
164 self.sequence.len()
165 }
166}
167
168/// Parse a FASTA or Multi FASTA file.
169/// Sequence descriptions are expected to start with '>'.
170/// The deprecated comment character ';' is not parsed, neither for sequence descriptors nor for
171/// additional comment lines.
172/// Parsing is done lazily: Sequence descriptions and sequences are identified, but are not further
173/// processed.
174///
175/// # Errors
176/// If the file is not empty, but the first character is not a greater-than sign (`>`), the function
177/// returns an [`InvalidDescription`] error.
178///
179/// If the file ends in a valid FASTA sequence description, but no sequence follows, the function
180/// returns an [`EmptySequence`] error.
181///
182/// # Returns
183/// A [`Fasta`] instance containing all sequences from the Multi-Fasta file
184///
185/// [`InvalidDescription`]: ParseError::InvalidDescription
186/// [`EmptySequence`]: ParseError::EmptySequence
187pub fn parse_fasta_str(s: &str) -> Result<Fasta, ParseError> {
188 parse_fasta(s.as_bytes())
189}
190
191/// Parse a FASTA or Multi FASTA file.
192/// Sequence descriptions are expected to start with '>'.
193/// The deprecated comment character ';' is not parsed, neither for sequence descriptors nor for
194/// additional comment lines.
195/// Parsing is done lazily: Sequence descriptions and sequences are identified, but are not further
196/// processed.
197///
198/// # Errors
199/// If the file is not empty, but the first character is not a greater-than sign (`>`), the function
200/// returns an [`InvalidDescription`] error.
201///
202/// If the file ends in a valid FASTA sequence description, but no sequence follows, the function
203/// returns an [`EmptySequence`] error.
204///
205/// # Returns
206/// A [`Fasta`] instance containing all sequences from the Multi-Fasta file
207///
208/// [`InvalidDescription`]: ParseError::InvalidDescription
209/// [`EmptySequence`]: ParseError::EmptySequence
210pub fn parse_fasta(data: &[u8]) -> Result<Fasta, ParseError> {
211 let mut sequences = Vec::new();
212
213 if data.is_empty() {
214 return Ok(Fasta { sequences });
215 }
216
217 let mut cursor = 0usize;
218
219 loop {
220 if !expect(data, b'>', &mut cursor) {
221 return Err(ParseError::InvalidDescription {
222 invalid: data[cursor],
223 });
224 }
225
226 let header_end = memchr(b'\n', &data[cursor..]).unwrap_or(data.len() - cursor);
227 let description = &data[cursor..cursor + header_end];
228 cursor += header_end + 1;
229
230 if cursor >= data.len() {
231 return Err(ParseError::EmptySequence);
232 }
233
234 let sequence_end = memchr(b'>', &data[cursor..]).unwrap_or(data.len() - cursor);
235 // may contain trailing white space
236 let sequence = &data[cursor..cursor + sequence_end];
237 cursor += sequence_end;
238
239 sequences.push(FastaSequence {
240 description,
241 sequence,
242 });
243
244 if cursor >= data.len() {
245 break;
246 }
247 }
248
249 Ok(Fasta { sequences })
250}
251
/// Checks whether the byte at position `cursor` in `data` equals `expected`.
/// On a match the cursor is advanced by one and `true` is returned;
/// otherwise the cursor is left untouched and `false` is returned.
///
/// Indexes `data` directly, so it panics if `*cursor` is out of bounds.
#[inline]
fn expect(data: &[u8], expected: u8, cursor: &mut usize) -> bool {
    let matched = data[*cursor] == expected;
    if matched {
        *cursor += 1;
    }
    matched
}
263
264#[cfg(test)]
265mod tests;