bio_streams/
lib.rs

1//! # bio-streams
2//!
3//! ### Types and data structures for streaming genomics data
4//!
5//! #### This crate is in early development. Contributions are very welcome.
6//!
7//! WebAssembly examples: [Remove non-M. tuberculosis reads from streaming fastqs](https://jeff-k.github.io/fqdemo/), [amplicon based SARS-CoV-2 assembly](https://jeff-k.github.io/amplicon-tiling/)
8//!
9//! ## Features
10//!
11//! `Record` type shared by `Fastq` and `Fasta` streams:
12//!
13//! ```
14//! use bio_streams::record::Phred;
15//!
16//! pub struct Record<T: for<'a> TryFrom<&'a [u8]> = Vec<u8>> {
17//!    pub fields: Vec<u8>,
18//!    pub seq: T,
19//!    pub quality: Option<Vec<Phred>>, // fasta records set quality to `None`
20//! }
21//! ```
22//!
23//! Records can be read into custom types: `pub struct Fastq<R: BufRead, T = Seq<Dna>>`
24//!
25//! ## Examples
26//!
27//! ### Stream a pair of fastqs and check some conditions on their name fields
28//! ```text
29//! // Open a pair of gzipped fastq files as streams of `Record`s with `Seq<Dna>` sequences
30//!
31//! let fq1: Fastq<BufReader<MultiGzDecoder<File>>> = Fastq::new(BufReader::new(
32//!    MultiGzDecoder::new(File::open(&file1).unwrap()),
33//! ));
34//!
35//! let fq2: Fastq<BufReader<MultiGzDecoder<File>>> = Fastq::new(BufReader::new(
36//!    MultiGzDecoder::new(File::open(&file2).unwrap()),
37//! ));
38//!
39//! for zipped in fq1.zip(fq2) {
40//!     match zipped {
41//!         (Ok(r1), Ok(r2)) => {
42//!            // check that the last characters of the name strings are 1 and 2
43//!            if r1.fields[r1.fields.len() - 1] != b'1' || r2.fields[r2.fields.len() - 1] != b'2'
44//!            {
45//!                eprintln!("paired records do not end in 1/2");
46//!            }
47//!
48//!            // check that the description fields are equal up to the last character
49//!            if r1.fields[..r1.fields.len() - 1] != r2.fields[..r2.fields.len() - 1] {
50//!                eprintln!("reads do not have the same names");
51//!            }
52//!         }
53//!         _ => {
54//!             eprintln!("Parse error in fastq files");
55//!         }
56//!     }
57//! }
58//! ```
59//!
60//! ### Count amino acid k-mers
61//!
62//! ```text
63//! // this opens a fasta file as a buffered stream and parses it into `Record`s with `Seq<Amino>` sequence fields
64//! let faa: Fasta<BufReader<File>, Seq<Amino>> =
65//!     Fasta::new(BufReader::new(File::open(&faa_file).unwrap()));
66//!
67//! // we can convert amino acid k-mers directly into usizes and use them to index into a table
68//! let mut histogram = Box::new([0u64; 1 << (K * Amino::BITS as usize)]);
69//!
70//! for contig in faa {
71//!    // here "contig" is a fasta record
72//!    for kmer in contig.unwrap().seq.kmers::<K>() {
73//!        histogram[usize::from(kmer)] += 1;
74//!    }
75//! }
76//! ```
77//!
78mod error;
79pub mod fasta;
80pub mod fastq;
81pub mod record;
82//pub mod sam;
83
84pub use crate::error::FastxError;
85pub use crate::record::{Reader, Record};