fastx/
lib.rs

1/*
2inspired by seq_io, https://github.com/markschl/seq_io
3Copyright (c) 2021 Andreas Hauser <Andreas.Hauser@LMU.de>
4License: Attribution-ShareAlike 4.0 International
5 */
6
7#[allow(non_snake_case)]
8pub mod FastX
9{
10    use flate2::read::MultiGzDecoder;
11    use std::ffi::OsStr;
12    use std::io;
13    use std::io::BufRead;
14
15    const PER_THREAD_BUF_SIZE: usize = 600 * 1024 * 1024;
16
17    pub enum FastXFormat
18    {
19        FASTQ,
20        FASTA,
21        EOF,
22        UNKNOWN,
23    }
24
25    #[derive(Default)]
26    pub struct FastARecord
27    {
28        pub name: String,
29        pub raw_seq: Vec<u8>,
30    }
31
32    #[derive(Default)]
33    pub struct FastQRecord
34    {
35        name: String,
36        seq: Vec<u8>,
37        comment: String,
38        qual: Vec<u8>,
39    }
40
41    pub trait FastXRead: std::fmt::Display
42    {
43        fn read(&mut self, reader: &mut dyn BufRead) -> io::Result<usize>;
44        fn name(&self) -> &String;
45        fn id(&self) -> &str;
46        fn desc(&self) -> &str;
47        fn seq_raw(&self) -> &Vec<u8>;
48        fn seq(&self) -> Vec<u8>;
49        fn seq_len(&self) -> usize;
50        fn lines(&self) -> Vec<&[u8]>;
51    }
52
53    pub trait FastQRead: FastXRead
54    {
55        fn comment(&self) -> &str;
56        fn qual(&self) -> &Vec<u8>;
57    }
58
59    impl std::fmt::Display for FastARecord
60    {
61        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
62        {
63            write!(f, ">{}\n{}", self.name(), String::from_utf8_lossy(&self.seq()))
64        }
65    }
66
67    impl FastXRead for FastARecord
68    {
69        fn name(&self) -> &String
70        {
71            &self.name
72        }
73
74        fn id(&self) -> &str
75        {
76            match memchr::memchr(b' ', self.name.as_bytes())
77            {
78                None => &self.name,
79                Some(i) => &self.name[..i],
80            }
81        }
82
83        fn desc(&self) -> &str
84        {
85            match memchr::memchr(b' ', self.name.as_bytes())
86            {
87                None => "",
88                Some(i) => &self.name[i + 1..],
89            }
90        }
91
92        fn seq_raw(&self) -> &Vec<u8>
93        {
94            &self.raw_seq
95        }
96
97        fn seq(&self) -> Vec<u8>
98        {
99            let mut seq = vec![0; self.raw_seq.len()];
100            let mut line_start = 0;
101            let mut seq_end = 0;
102            let mut seq_start = 0;
103            memchr::memchr_iter(b'\n', &self.raw_seq).for_each(|line_end| {
104                seq_start = seq_end;
105                seq_end += line_end - line_start;
106                seq[seq_start..seq_end].copy_from_slice(&self.raw_seq[line_start..line_end]);
107                line_start = line_end + 1; // skip '\n'
108            });
109            if line_start < self.raw_seq.len()
110            {
111                seq_start = seq_end;
112                seq_end += self.raw_seq.len() - line_start;
113                seq[seq_start..seq_end]
114                    .copy_from_slice(&self.raw_seq[line_start..self.raw_seq.len()]);
115                seq.resize(seq_end, 0);
116            }
117            seq
118        }
119
120        fn lines(&self) -> Vec<&[u8]>
121        {
122            //self.seq.split( |c| *c == b'\n').collect()
123            let mut line_start = 0;
124            memchr::memchr_iter(b'\n', &self.raw_seq)
125                .map(|line_end| {
126                    let line = &self.raw_seq[line_start..line_end];
127                    line_start = line_end + 1;
128                    line
129                })
130                .collect()
131        }
132
133        fn seq_len(&self) -> usize
134        {
135            let mut line_start = 0;
136            memchr::memchr_iter(b'\n', &self.raw_seq).fold(0, |mut len, line_end| {
137                len += line_end - line_start;
138                line_start = line_end + 1;
139                len
140            }) + self.raw_seq.len()
141                - line_start
142        }
143
144        fn read(&mut self, reader: &mut dyn BufRead) -> io::Result<usize>
145        {
146            self.name.clear();
147            self.raw_seq.clear();
148
149            let mut size = 0;
150
151            let mut record_sep = [0_u8];
152            match reader.read(&mut record_sep)
153            {
154                Err(e) => return Err(e),
155                Ok(0) => return Ok(0),
156                Ok(some) =>
157                {
158                    size += some;
159                    if some != 1 && record_sep[0] != b'>'
160                    {
161                        return Err(io::Error::new(
162                            io::ErrorKind::InvalidData,
163                            "record_sep does not match '>'",
164                        ));
165                    }
166                }
167            };
168
169            match reader.read_line(&mut self.name)
170            {
171                Err(e) => return Err(e),
172                Ok(0) => return Ok(0),
173                Ok(some) => size += some,
174            };
175            rstrip_newline_string(&mut self.name);
176
177            match read_until_before(reader, b'>', &mut self.raw_seq)
178            {
179                Err(e) => Err(e),
180                Ok(0) => Ok(0),
181                Ok(some) =>
182                {
183                    rstrip_seq(&mut self.raw_seq);
184                    Ok(size + some)
185                }
186            }
187        }
188    }
189
190    impl std::fmt::Display for FastQRecord
191    {
192        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
193        {
194            write!(
195                f,
196                "@{}\n{}\n+\n{}",
197                self.name(),
198                String::from_utf8_lossy(&self.seq()),
199                String::from_utf8_lossy(&self.qual())
200            )
201        }
202    }
203
204    impl FastXRead for FastQRecord
205    {
206        fn name(&self) -> &String
207        {
208            &self.name
209        }
210
211        fn id(&self) -> &str
212        {
213            match memchr::memchr(b' ', self.name.as_bytes())
214            {
215                None => &self.name[1..],
216                Some(i) => &self.name[1..i],
217            }
218        }
219
220        fn desc(&self) -> &str
221        {
222            match memchr::memchr(b' ', self.name.as_bytes())
223            {
224                None => "",
225                Some(i) => &self.name[i + 1..],
226            }
227        }
228
229        fn seq_raw(&self) -> &Vec<u8>
230        {
231            &self.seq
232        }
233
234        // As multiline FastQ is very uncommon, we assume seq to be one line
235        fn seq(&self) -> Vec<u8>
236        {
237            self.seq.clone()
238        }
239
240        fn seq_len(&self) -> usize
241        {
242            self.seq
243                .split(|c| *c == b'\n')
244                .fold(0, |len, seq| len + seq.len())
245        }
246
247        fn lines(&self) -> Vec<&[u8]>
248        {
249            self.seq.split(|c| *c == b'\n').collect()
250        }
251
252        fn read(&mut self, reader: &mut dyn BufRead) -> io::Result<usize>
253        {
254            self.name.clear();
255            let mut size;
256            match reader.read_line(&mut self.name)
257            {
258                Err(e) => return Err(e),
259                Ok(0) => return Ok(0),
260                Ok(some) => size = some,
261            }
262            rstrip_newline_string(&mut self.name); //self.name.truncate(size - 1); // truncate newline XXX non UNIX
263            assert!(self.name.remove(0) == '@');
264
265            self.seq.clear();
266            match reader.read_until(b'\n', &mut self.seq)
267            {
268                Err(e) => return Err(e),
269                Ok(0) => return Ok(0),
270                Ok(some) =>
271                {
272                    rstrip_newline_vec(&mut self.seq);
273                    size += some;
274                }
275            }
276
277            self.comment.clear();
278            match reader.read_line(&mut self.comment)
279            {
280                Err(e) => return Err(e),
281                Ok(0) => return Ok(0),
282                Ok(some) =>
283                {
284                    rstrip_newline_string(&mut self.comment); //self.name.truncate(size - 1); // truncate newline XXX non UNIX
285                    size += some
286                }
287            }
288
289            self.qual.clear();
290            match reader.read_until(b'\n', &mut self.qual)
291            {
292                Err(e) => Err(e),
293                Ok(0) => Ok(0),
294                Ok(some) =>
295                {
296                    rstrip_newline_vec(&mut self.qual);
297                    Ok(size + some)
298                }
299            }
300        }
301    }
302
303    impl FastQRead for FastQRecord
304    {
305        fn comment(&self) -> &str
306        {
307            &self.comment[1..]
308        }
309
310        fn qual(&self) -> &Vec<u8>
311        {
312            &self.qual
313        }
314    }
315
316    fn rstrip_newline_string(s: &mut String)
317    {
318        while s.ends_with(&"\n")
319        {
320            s.truncate(s.len() - 1);
321        }
322    }
323
324    fn rstrip_seq(s: &mut Vec<u8>)
325    {
326        while s[s.len() - 1] == b'>' || s[s.len() - 1] == b'\n'
327        //.ends_with(&[b'>',b'\n'])
328        {
329            s.truncate(s.len() - 1);
330        }
331    }
332
333    fn rstrip_newline_vec(s: &mut Vec<u8>)
334    {
335        while s[s.len() - 1] == b'\n'
336        {
337            s.truncate(s.len() - 1);
338        }
339    }
340
341    // Determines the format from reading the first byte(s) of
342    // the Readable
343    pub fn peek(reader: &mut dyn BufRead) -> io::Result<(FastXFormat, u8)>
344    {
345        let buf = reader.fill_buf().expect("peek failed");
346        let format = match buf[0] as char
347        {
348            '>' => FastXFormat::FASTA,
349            '@' => FastXFormat::FASTQ,
350            '\0' => FastXFormat::EOF,
351            _ => FastXFormat::UNKNOWN,
352        };
353        if let FastXFormat::UNKNOWN = format
354        {
355            return Err(io::Error::new(
356                io::ErrorKind::InvalidData,
357                "Wrong format expected '>' or '@'!",
358            ));
359        }
360        Ok((format, buf[0]))
361    }
362
363    use std::fs::File;
364    use std::io::BufReader;
365    use std::path::Path;
366    //use std::str::pattern::Pattern;
367
368    pub fn reader_from_path(path: &Path) -> io::Result<Box<dyn BufRead>>
369    {
370        let file = File::open(path)?;
371        let reader: Box<dyn BufRead> = match path.extension()
372        {
373            Some(extension) if extension == OsStr::new("gz") => Box::new(BufReader::with_capacity(
374                PER_THREAD_BUF_SIZE,
375                MultiGzDecoder::new(BufReader::new(file)),
376            )),
377            _ => Box::new(BufReader::with_capacity(PER_THREAD_BUF_SIZE, file)),
378        };
379        Ok(reader)
380    }
381
382    pub fn from_reader(reader: &mut dyn BufRead) -> io::Result<Box<dyn FastXRead>>
383    {
384        let (format, first) = peek(reader)?;
385        match format
386        {
387            FastXFormat::FASTA => Ok(Box::new(FastARecord::default())),
388            FastXFormat::FASTQ => Ok(Box::new(FastQRecord::default())),
389            FastXFormat::EOF | FastXFormat::UNKNOWN =>
390            {
391                Err(io::Error::new(io::ErrorKind::InvalidData, format!("{:?}", first)))
392            }
393        }
394    }
395
396    /// from std::io::read_until, adapted to not consume the delimiter
397    fn read_until_before<R: BufRead + ?Sized>(
398        r: &mut R,
399        delim: u8,
400        buf: &mut Vec<u8>,
401    ) -> io::Result<usize>
402    {
403        let mut read = 0;
404        loop
405        {
406            let (done, used) = {
407                let available = match r.fill_buf()
408                {
409                    Ok(n) => n,
410                    //Err(ref e) if e.is_interrupted() => continue,
411                    Err(e) => return Err(e),
412                };
413                match memchr::memchr(delim, available)
414                {
415                    Some(i) =>
416                    {
417                        buf.extend_from_slice(&available[..=i]);
418                        (true, i + 1)
419                    }
420                    None =>
421                    {
422                        buf.extend_from_slice(available);
423                        (false, available.len())
424                    }
425                }
426            };
427            if done
428            {
429                r.consume(used - 1); // do not consume delimiter
430                read += used - 1;
431                return Ok(read);
432            }
433            else
434            {
435                r.consume(used);
436                read += used;
437            }
438            if used == 0
439            {
440                return Ok(read);
441            }
442        }
443    }
444}
445
446#[cfg(test)]
447mod tests
448{
449    use super::FastX::FastARecord;
450    use super::FastX::FastQRecord;
451    use super::FastX::FastXRead;
452    use std::io::BufReader;
453    use std::io::Cursor;
454
455    #[test]
456    fn fasta()
457    {
458        let mut x = BufReader::new(Cursor::new(">a\nAGTC\n>b\nTAGC\nTTTT\n>c\nGCTA"));
459        let mut record = FastARecord::default();
460        let _ = record.read(&mut x);
461        assert_eq!("a", record.name());
462        assert_eq!(b"AGTC".to_vec(), record.seq());
463        assert_eq!(&b"AGTC".to_vec(), record.seq_raw());
464        let _ = record.read(&mut x);
465        assert_eq!("b", record.name());
466        assert_eq!(b"TAGCTTTT".to_vec(), record.seq());
467        assert_eq!(&b"TAGC\nTTTT".to_vec(), record.seq_raw());
468        let _ = record.read(&mut x);
469        assert_eq!("c", record.name());
470        assert_eq!(b"GCTA".to_vec(), record.seq());
471        assert_eq!(&b"GCTA".to_vec(), record.seq_raw());
472    }
473
474    #[test]
475    fn fasta_memchr()
476    {
477        let fasta = b">a\nAGTC\n>b\nTAGC\nTTTT\n>c\nGCTA\n";
478        let mut reader = BufReader::new(Cursor::new(fasta));
479        let mut record = FastARecord::default();
480        let mut output: Vec<u8> = Vec::new();
481        //peek(&mut reader).expect("peek");
482        while let Ok(_some @ 1..=usize::MAX) = record.read(&mut reader)
483        {
484            output.push(b'>');
485            output.append(&mut record.name().as_bytes().to_vec());
486            output.push(b'\n');
487            let seq = record.seq_raw();
488            println!("r{}r", String::from_utf8_lossy(seq));
489            let mut offset = 0;
490            memchr::memchr_iter(b'\n', seq)
491                .map(|line_end| {
492                    let seq = &seq[offset..line_end];
493                    println!("#{}#", String::from_utf8_lossy(seq));
494                    offset = line_end + 1;
495                    seq
496                })
497                .for_each(|line| {
498                    output.append(&mut line.to_vec());
499                    output.push(b'\n');
500                });
501            output.append(&mut seq[offset..].to_vec());
502            output.push(b'\n');
503        }
504        println!("{}\n\n{}", String::from_utf8_lossy(fasta), String::from_utf8_lossy(&output));
505        assert_eq!(fasta.to_vec(), output);
506    }
507
508    #[test]
509    fn fastq()
510    {
511        let mut x = BufReader::new(Cursor::new(
512            "@a\nAGTC\n+\n'&'*+\n@b\nTAGCTTTT\n+\n'&'*+'&'*+\n@c\nGCTA\n+\n'&'*+",
513        ));
514        let mut record = FastQRecord::default();
515        let _ = record.read(&mut x);
516        assert_eq!("a", record.name());
517        assert_eq!(b"AGTC".to_vec(), record.seq());
518        assert_eq!(&b"AGTC".to_vec(), record.seq_raw());
519
520        let _ = record.read(&mut x);
521        assert_eq!("b", record.name());
522        assert_eq!(b"TAGCTTTT".to_vec(), record.seq());
523        assert_eq!(&b"TAGCTTTT".to_vec(), record.seq_raw());
524
525        let _ = record.read(&mut x);
526        assert_eq!("c", record.name());
527        assert_eq!(b"GCTA".to_vec(), record.seq());
528        assert_eq!(&b"GCTA".to_vec(), record.seq_raw());
529    }
530}