bio_rust/
lib.rs

1//! # bio-rust
2//! 解析生物信息领域的基本数据结构,提供操纵这些数据的接口和构建一些统计模型。
3use std::io::{prelude::*, BufReader};
4use std::error::Error;
5use std::fs::File;
6use flate2::read::GzDecoder;
7use flate2::Compression;
8use flate2::write::GzEncoder;
9use std::iter::Iterator;
10extern crate flate2;
11
/// A single FASTQ record, stored as its four raw text lines.
#[derive(Debug, Clone)]
pub struct Reads {
    /// First line of the record.
    read_id: String,
    /// Second line of the record; `len` reports its byte length.
    sequence: String,
    /// Third line of the record.
    read_name: String,
    /// Fourth line of the record; `lt_qc` decodes each of its bytes as a quality score.
    quality: String,
}

impl Reads {
    /// Returns the length of the sequence line in bytes.
    pub fn len(&self) -> usize {
        self.sequence.len()
    }

    /// Returns `true` when the sequence line is empty.
    pub fn is_empty(&self) -> bool {
        self.sequence.is_empty()
    }

    /// Counts the quality bytes whose decoded score is greater than or equal to `score`.
    ///
    /// Each byte of the quality line is decoded as `byte - 33`.
    ///
    /// NOTE(review): the `lt` in the name suggests "less than", but the comparison
    /// has always been `>=`; the behavior is kept as-is — confirm the intended meaning.
    pub fn lt_qc(&self, score: i32) -> i32 {
        // Count directly instead of collecting the passing scores into a temporary Vec.
        self.quality
            .bytes()
            .filter(|&b| i32::from(b) - 33 >= score)
            .count() as i32
    }

    /// Renders the record as its four lines, each terminated by `\n`.
    fn display(&self) -> String {
        format!(
            "{}\n{}\n{}\n{}\n",
            self.read_id, self.sequence, self.read_name, self.quality
        )
    }
}
49
50pub struct Fastq {
51    reads: Vec<Reads>,
52    length: u64,
53}
54
55impl Fastq {
56    pub fn new(reads: Vec<Reads>) -> Self{
57        let length = reads.len() as u64;
58        Fastq {
59            reads,
60            length,
61        }
62    }
63
64    fn push(&mut self, reads: Reads) {
65        self.reads.push(reads);
66        self.length += 1u64;
67    }
68
69    pub fn total_base_num(&self) -> u64 {
70        self.reads.iter().map(|r| r.len()).fold(0u64, |acc, x| acc + x as u64)
71    }
72    pub fn qc_num(&self, score: i32) -> u64 {
73        self.reads.iter().map(|r| r.lt_qc(score)).fold(0u64, |acc, x| acc + x as u64)
74    }
75
76    // from file
77    pub fn from_file(file_path: &str) -> Result<Self, Box<dyn Error>> {
78        // todo 目前只支持fastq.gz格式的输入,需要兼容fastq文本格式的输入。
79        let fastq_gz = File::open(file_path).expect(format!("No such file or directory: {}", file_path).as_str());
80        let fastq_content = GzDecoder::new(fastq_gz);
81        let fastq_reader = BufReader::new(fastq_content);
82        let mut line_iter = fastq_reader.lines().map(|l| l.unwrap());
83        let mut fastq = Fastq::new(Vec::new());
84        loop {
85            let read_id: String;
86            let sequence: String;
87            let read_name: String;
88            let quality: String;
89            match line_iter.next(){
90                None => {break;}
91                Some(element) => {
92                    read_id = element;
93                }
94            }
95            match line_iter.next(){
96                None => {break}
97                Some(element) => {
98                    sequence = element;
99                }
100            }
101            match line_iter.next(){
102                None => {break}
103                Some(element) => {
104                    read_name = element;
105                }
106            }
107            match line_iter.next(){
108                None => {break}
109                Some(element) => {
110                    quality = element;
111                }
112            }
113            fastq.push(Reads{
114                read_id,
115                sequence,
116                read_name,
117                quality
118            })
119        }
120        Ok(fastq)
121    }
122
123    pub fn extent(&mut self, other_fastq: &Fastq) {
124        for reads in other_fastq.reads.iter() {
125            self.push(reads.clone())
126        }
127    }
128    // merge fastq
129    pub fn merge_fastq(fastq_vec: Vec<&str>) -> Result<Self, Box<dyn Error>> {
130        let mut ret_fastq = Fastq::new(Vec::new());
131        // todo
132        for fastq_path in fastq_vec.iter() {
133            // 3. 从压缩文件中读取文件
134            let fastq_tmp = Fastq::from_file(fastq_path)?;
135            ret_fastq.extent(&fastq_tmp);
136        }
137        Ok(ret_fastq)
138    }
139
140    pub fn to_file(&self, file_path: &str) -> Result<(), Box<dyn Error>> {
141        // print_fastq_to_file
142        let mut out_encoder = GzEncoder::new(Vec::new(), Compression::default());
143        for reads in self.reads.iter() {
144            out_encoder.write_all(reads.display().as_bytes())?;
145        }
146        // todo
147        let compressed_bytes = out_encoder.finish()?;
148        let mut file = File::create(file_path).expect("create failed");
149        file.write_all(&compressed_bytes).expect("write failed");
150        Ok(())
151    }
152}
153
154impl Iterator for Fastq {
155    type Item = Reads;
156
157    fn next(&mut self) -> Option<Self::Item> {
158        match self.reads.iter().next() {
159            None => { None }
160            Some(reads) => { Some(reads.clone())}
161        }
162    }
163}
164
165
#[cfg(test)]
mod tests {
    use std::error::Error;
    use crate::Fastq;

    /// Merges the two paired fixture files and writes the result back to disk.
    #[test]
    fn test_merge_fastq() -> Result<(), Box<dyn Error>> {
        let fastq_file_vec = vec!["data/s1062207050023_1.fastq.gz", "data/s1062207050023_2.fastq.gz"];
        let m_fastq = Fastq::merge_fastq(fastq_file_vec)?;
        // Propagate write failures; previously the returned `Result` was discarded,
        // so the test passed even when nothing was written.
        m_fastq.to_file("data/s1062207050023.fastq.gz")?;
        Ok(())
    }

    /// Loads one fixture file and prints its first record, if any.
    #[test]
    fn test_iter_fastq() -> Result<(), Box<dyn Error>> {
        let mut m_fastq = Fastq::from_file("data/s1062207050023_1.fastq.gz")?;
        if let Some(read) = m_fastq.next() {
            println!("{}", read.display());
        }
        Ok(())
    }
}