1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
//! Handles all the FASTA/FASTQ parsing
use std::fs::File;
use std::io::{stdin, Cursor, Read};
use std::path::Path;

#[cfg(feature = "compression")]
use bzip2::read::BzDecoder;
#[cfg(feature = "compression")]
use flate2::read::MultiGzDecoder;
#[cfg(feature = "compression")]
use xz2::read::XzDecoder;

use crate::errors::ParseError;
pub use crate::parser::fasta::Reader as FastaReader;
pub use crate::parser::fastq::Reader as FastqReader;

mod record;
mod utils;

mod fasta;
mod fastq;

pub use crate::parser::utils::FastxReader;

// Magic bytes for each compression format
#[cfg(feature = "compression")]
const GZ_MAGIC: [u8; 2] = [0x1F, 0x8B];
#[cfg(feature = "compression")]
const BZ_MAGIC: [u8; 2] = [0x42, 0x5A];
#[cfg(feature = "compression")]
const XZ_MAGIC: [u8; 2] = [0xFD, 0x37];

fn get_fastx_reader<'a, R: 'a + io::Read + Send>(
    reader: R,
    first_byte: u8,
) -> Result<Box<dyn FastxReader + 'a>, ParseError> {
    match first_byte {
        b'>' => Ok(Box::new(FastaReader::new(reader))),
        b'@' => Ok(Box::new(FastqReader::new(reader))),
        _ => Err(ParseError::new_unknown_format(first_byte)),
    }
}

/// The main entry point of needletail if you're reading from something that implements [`std::io::Read`].
/// This automatically detects whether the file is:
/// 1. compressed: [`gzip`][gzip], [`bz`][bz] and [`xz`][xz] are supported and will use the appropriate decoder
/// 2. FASTA or FASTQ: the right parser will be automatically instantiated
///
/// Option 1 is only available if the `compression` feature is enabled.
///
/// # Errors
///
/// If the object you're reading from has less than 2 bytes then a [`ParserError`](needletail::errors::ParserError) of the kind
/// [`ParseErrorKind::EmptyFile`](needletail::errors::ParseErrorKind::EmptyFile) is returned.
///
/// If the first byte in the object is unknown, then a `ParserError` of the kind
/// [`ParseErrorKind::UnknownFormat`](needletail::errors::ParseErrorKind::UnknownFormat) is returned.
///
/// # Examples
///
/// ```
/// use needletail::parse_fastx_reader;
///
/// let reader = ">read1\nACGT\nread2\nGGGG".as_bytes();
/// let mut fastx_reader = parse_fastx_reader(reader).expect("invalid reader");
/// let mut idx = 0;
/// let read_ids = [b"read1", b"read2"];
///
/// while let Some(r) = fastx_reader.next() {
///     let record = r.expect("invalid record");
///     assert_eq!(record.id(), read_ids[idx]);
///     idx += 1;
/// }
/// ```
///
/// [gzip]: https://www.gnu.org/software/gzip/
/// [bz]: https://sourceware.org/bzip2/
/// [xz]: https://tukaani.org/xz/format.html
///
pub fn parse_fastx_reader<'a, R: 'a + io::Read + Send>(
    mut reader: R,
) -> Result<Box<dyn FastxReader + 'a>, ParseError> {
    let mut first_two_bytes = [0; 2];
    reader
        .read_exact(&mut first_two_bytes)
        .map_err(|_| ParseError::new_empty_file())?;
    let first_two_cursor = Cursor::new(first_two_bytes);
    let new_reader = first_two_cursor.chain(reader);

    match first_two_bytes {
        #[cfg(feature = "compression")]
        GZ_MAGIC => {
            let mut gz_reader = MultiGzDecoder::new(new_reader);
            let mut first = [0; 1];
            gz_reader.read_exact(&mut first)?;
            let r = Cursor::new(first).chain(gz_reader);
            get_fastx_reader(r, first[0])
        }
        #[cfg(feature = "compression")]
        BZ_MAGIC => {
            let mut bz_reader = BzDecoder::new(new_reader);
            let mut first = [0; 1];
            bz_reader.read_exact(&mut first)?;
            let r = Cursor::new(first).chain(bz_reader);
            get_fastx_reader(r, first[0])
        }
        #[cfg(feature = "compression")]
        XZ_MAGIC => {
            let mut xz_reader = XzDecoder::new(new_reader);
            let mut first = [0; 1];
            xz_reader.read_exact(&mut first)?;
            let r = Cursor::new(first).chain(xz_reader);
            get_fastx_reader(r, first[0])
        }
        _ => get_fastx_reader(new_reader, first_two_bytes[0]),
    }
}

/// The main entry point of needletail if you're reading from stdin.
/// Shortcut to calling `parse_fastx_reader` with `stdin()`
pub fn parse_fastx_stdin() -> Result<Box<dyn FastxReader>, ParseError> {
    let stdin = stdin();
    parse_fastx_reader(stdin)
}

/// The main entry point of needletail if you're reading from a file.
/// Shortcut to calling `parse_fastx_reader` with a file
pub fn parse_fastx_file<P: AsRef<Path>>(path: P) -> Result<Box<dyn FastxReader>, ParseError> {
    parse_fastx_reader(File::open(&path)?)
}

pub use record::{mask_header_tabs, mask_header_utf8, write_fasta, write_fastq, SequenceRecord};
use std::io;
pub use utils::{Format, LineEnding};

#[cfg(test)]
mod test {
    use crate::errors::ParseErrorKind;
    use crate::parse_fastx_reader;

    #[test]
    fn test_empty_file_raises_parser_error_of_same_kind() {
        let reader = "".as_bytes();
        let actual = parse_fastx_reader(reader);
        assert!(actual.is_err());

        let actual_err = actual.err().unwrap().kind;
        let expected_err = ParseErrorKind::EmptyFile;
        assert_eq!(actual_err, expected_err);
    }

    #[test]
    fn test_only_one_byte_in_file_raises_empty_file_error() {
        let reader = "@".as_bytes();
        let actual = parse_fastx_reader(reader);
        assert!(actual.is_err());

        let actual_err = actual.err().unwrap().kind;
        let expected_err = ParseErrorKind::EmptyFile;
        assert_eq!(actual_err, expected_err);
    }
}