fire_fasta/
lib.rs

1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3
4//! # Fire-Fasta
5//! Ultra-fast, lightweight, zero-copy, lazy Multi-FASTA parser.
6//!
7//! The parser is intended for high performance applications where the input is expected to be well-formed.
8//! Therefore, it sacrifices input validation and deprecated features for parsing performance.
9//!
10//! ### Sequence Characters
11//! The parser makes no assumptions about the sequence alphabet:
12//! It is explicitly intended for custom sequences with characters that do not conform to NCBI specifications.
13//! The only characters not allowed in sequences are unix-style newlines (`LF`),
14//! which are ignored, and the greater-than sign (`>`),
15//! which starts a new sequence descriptor in Multi-FASTA files.
//! Note that the parser does not validate whether a sequence description starts at the beginning of a new line.
17//!
18//! The parser expects input data that is compatible with ASCII.
19//! Multibyte UTF-8 codepoints are processed as separate ASCII characters.
20//!
21//! Windows-style newlines (`CRLF`) are not supported.
//! Instead, the parser treats the `LF` as a unix-style newline and preserves the `CR` as a valid sequence character.
23//! Old FASTA comments starting with `;` are also not supported, they are treated as part of the sequence.
24//!
25//! ### Usage and Lazy Parsing
26//! Calling the parser does one pass over the entire input, separating individual fasta sequences from each other.
27//! No further processing is done and no data is copied.
28//! ```rust
29//! # use fire_fasta::parse_fasta_str;
30//! # use std::error::Error;
31//! # fn main() -> Result<(), Box<dyn Error>> {
32//! let seq = ">example\nMSTIL\nAATIL\n\n";
33//! let fasta = parse_fasta_str(&seq)?;
34//! // or parse_fasta(&data) for &[u8] slices
35//!
36//! assert_eq!(fasta.sequences.len(), 1);
37//!
38//! // Iterating over a sequence removes newlines from the iterator on the fly:
39//! assert_eq!(
40//!     String::from_utf8(fasta.sequences[0].iter().copied().collect::<Vec<_>>())?,
41//!     "MSTILAATIL"
42//! );
43//!
44//! //If you want to iterate over a sequence multiple times, it may be faster to first copy the full sequence into its own buffer:
45//! let copied: Box<[u8]> = fasta.sequences[0].copy_sequential();
46//! assert_eq!(copied.as_ref(), b"MSTILAATIL");
47//! # Ok(())
48//! # }
49//! ```
50//!
51//! Parsing and copying use the [memchr](https://crates.io/crates/memchr) crate,
52//! and thus operations use SIMD instructions when available.
53
54use memchr::memchr;
55use std::error::Error;
56use std::fmt::{Display, Formatter};
57
58/// A Multi FASTA file containing zero, one, or more [`FastaSequences`].
59/// Access the sequences simply through its `sequences` field:
60///
61/// ```rust
62/// # use fire_fasta::parse_fasta;
63/// # use std::error::Error;
64/// # fn main() -> Result<(), Box<dyn Error>> {
65/// let fasta_file = b">Sample1\nACGTCA\n>Sample2\nACGTCC";
66/// let fasta = parse_fasta(fasta_file)?;
67///
68/// assert_eq!(fasta.sequences[0].description, b"Sample1");
69/// assert_eq!(fasta.sequences[1].description, b"Sample2");
70///
71/// assert_eq!(*fasta.sequences[0].iter().nth(2).unwrap(), b'G');
72/// # Ok(())
73/// # }
74/// ```
75///
76/// [`FastaSequences`]: FastaSequence
77#[derive(Clone, Debug)]
78pub struct Fasta<'a> {
79    /// A vector of sequences present in the fasta file.
80    pub sequences: Vec<FastaSequence<'a>>,
81}
82
/// A single record of a FASTA file: a description line together with the sequence data
/// that follows it.
///
/// The sequence bytes are kept exactly as they appear in the input, line breaks included.
/// Newlines are only skipped when the sequence is accessed, see [`FastaSequence::iter`]
/// and [`FastaSequence::copy_sequential`].
#[derive(Debug, Clone)]
pub struct FastaSequence<'a> {
    /// The sequence description as a byte slice, without the leading `>` character
    /// and without the trailing newline.
    pub description: &'a [u8],
    // Raw, unprocessed sequence bytes; may still contain newlines and trailing white space.
    sequence: &'a [u8],
}
92
/// FASTA parsing error returned by the initial parsing step in [`parse_fasta`].
///
/// The type implements [`PartialEq`] and [`Eq`], so parse results can be compared directly,
/// for example with `assert_eq!` in tests.
///
/// [`parse_fasta`]: parse_fasta
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ParseError {
    /// Invalid descriptor start character.
    /// The parser expects any FASTA description line to start with '>'.
    /// The invalid character is returned in the error.
    ///
    /// Since the parser doesn't mind excess newlines between sequences,
    /// this error can only occur if the very first character of a FASTA file isn't a `>`.
    /// If further descriptors in a Multi-FASTA file don't start with `>`, they are added to their
    /// preceding sequence as valid sequence characters.
    InvalidDescription {
        /// The one-byte code point of the wrong descriptor character in the file.
        invalid: u8,
    },

    /// A valid descriptor was parsed, but no sequence is following.
    EmptySequence,
}
114
115impl Display for ParseError {
116    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
117        write!(f, "{self:?}")
118    }
119}
120
121impl Error for ParseError {}
122
123impl<'a> FastaSequence<'a> {
124    /// Returns an iterator over the FASTA sequence characters, excluding newlines.
125    /// Note that the parser expects unix-style line breaks, thus, CR-characters are preserved.
126    ///
127    /// Newlines are filtered out on the fly, meaning that multiple calls to `iter` repeatedly
128    /// search and skip them during iteration.
129    #[inline]
130    pub fn iter(&self) -> impl Iterator<Item = &u8> {
131        self.sequence.iter().filter(|&x| *x != b'\n')
132    }
133
134    /// Copy the sequence into a consecutive memory region.
135    /// This method allocates a buffer and copies the sequence into it, skipping newline symbols.
136    /// Note that any other symbols (including whitespace and line feeds) get preserved.
137    /// The capacity of the return value may be larger than the actual sequence.
138    /// It is guaranteed, however, that only one allocation is performed.
139    #[must_use]
140    pub fn copy_sequential(&self) -> Box<[u8]> {
141        let mut buffer = vec![0u8; self.size_hint()];
142        let mut target = 0;
143        let mut pos = 0;
144        loop {
145            let pivot = memchr(b'\n', &self.sequence[pos..]).unwrap_or(self.sequence.len() - pos);
146            buffer[target..target + pivot].copy_from_slice(&self.sequence[pos..pos + pivot]);
147            pos += pivot + 1;
148            target += pivot;
149
150            if pos >= self.sequence.len() {
151                break;
152            }
153        }
154        buffer.truncate(target);
155        buffer.into_boxed_slice()
156    }
157
158    /// Returns the maximum size in bytes this sequence occupies.
159    /// This size is a limit and could be smaller,
160    /// for example if newlines are filtered out of the sequence (see [`copy_sequential`])
161    ///
162    /// [`copy_sequential`]: FastaSequence::copy_sequential
163    pub fn size_hint(&self) -> usize {
164        self.sequence.len()
165    }
166}
167
168/// Parse a FASTA or Multi FASTA file.
169/// Sequence descriptions are expected to start with '>'.
170/// The deprecated comment character ';' is not parsed, neither for sequence descriptors nor for
171/// additional comment lines.
172/// Parsing is done lazily: Sequence descriptions and sequences are identified, but are not further
173/// processed.
174///
175/// # Errors
176/// If the file is not empty, but the first character is not a greater-than sign (`>`), the function
177/// returns an [`InvalidDescription`] error.
178///
179/// If the file ends in a valid FASTA sequence description, but no sequence follows, the function
180/// returns an [`EmptySequence`] error.
181///
182/// # Returns
183/// A [`Fasta`] instance containing all sequences from the Multi-Fasta file
184///
185/// [`InvalidDescription`]: ParseError::InvalidDescription
186/// [`EmptySequence`]: ParseError::EmptySequence
187pub fn parse_fasta_str(s: &str) -> Result<Fasta, ParseError> {
188    parse_fasta(s.as_bytes())
189}
190
191/// Parse a FASTA or Multi FASTA file.
192/// Sequence descriptions are expected to start with '>'.
193/// The deprecated comment character ';' is not parsed, neither for sequence descriptors nor for
194/// additional comment lines.
195/// Parsing is done lazily: Sequence descriptions and sequences are identified, but are not further
196/// processed.
197///
198/// # Errors
199/// If the file is not empty, but the first character is not a greater-than sign (`>`), the function
200/// returns an [`InvalidDescription`] error.
201///
202/// If the file ends in a valid FASTA sequence description, but no sequence follows, the function
203/// returns an [`EmptySequence`] error.
204///
205/// # Returns
206/// A [`Fasta`] instance containing all sequences from the Multi-Fasta file
207///
208/// [`InvalidDescription`]: ParseError::InvalidDescription
209/// [`EmptySequence`]: ParseError::EmptySequence
210pub fn parse_fasta(data: &[u8]) -> Result<Fasta, ParseError> {
211    let mut sequences = Vec::new();
212
213    if data.is_empty() {
214        return Ok(Fasta { sequences });
215    }
216
217    let mut cursor = 0usize;
218
219    loop {
220        if !expect(data, b'>', &mut cursor) {
221            return Err(ParseError::InvalidDescription {
222                invalid: data[cursor],
223            });
224        }
225
226        let header_end = memchr(b'\n', &data[cursor..]).unwrap_or(data.len() - cursor);
227        let description = &data[cursor..cursor + header_end];
228        cursor += header_end + 1;
229
230        if cursor >= data.len() {
231            return Err(ParseError::EmptySequence);
232        }
233
234        let sequence_end = memchr(b'>', &data[cursor..]).unwrap_or(data.len() - cursor);
235        // may contain trailing white space
236        let sequence = &data[cursor..cursor + sequence_end];
237        cursor += sequence_end;
238
239        sequences.push(FastaSequence {
240            description,
241            sequence,
242        });
243
244        if cursor >= data.len() {
245            break;
246        }
247    }
248
249    Ok(Fasta { sequences })
250}
251
/// Checks whether the byte at index `cursor` equals `expected`.
/// On a match the cursor is moved forward by one byte and `true` is returned;
/// otherwise the cursor is left untouched and `false` is returned.
///
/// `data` is indexed directly, so `cursor` has to be a valid index into `data`.
#[inline]
fn expect(data: &[u8], expected: u8, cursor: &mut usize) -> bool {
    let matched = data[*cursor] == expected;
    if matched {
        *cursor += 1;
    }
    matched
}
263
264#[cfg(test)]
265mod tests;