1use crate::errors;
6use crate::helpers::seq_id_from_description;
7use crate::read::FastaReader;
8
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::error;
12use std::fs::File;
13use std::io;
14use std::io::prelude::Seek;
15use std::io::BufWriter;
16use std::io::{BufRead, BufReader, SeekFrom, Write};
17use std::path::Path;
18
19#[derive(Debug)]
34pub struct FastaAccessions {
35 pub accessions: Vec<String>,
36}
37
38impl FastaAccessions {
39 pub fn from_fasta(path: &Path, separator: &str, id_index: usize) -> Self {
40 let reader = FastaReader::new(path);
41 let mut accessions = Vec::new();
42 for [header, _seq] in reader {
43 accessions.push(seq_id_from_description(&header, separator, id_index).to_string());
44 }
45 FastaAccessions { accessions }
46 }
47
48 pub fn to_json(&self, outpath: &Path) -> Result<(), io::Error> {
50 let mut file = BufWriter::new(File::create(&outpath)?);
51 serde_json::to_writer(&mut file, &self.accessions)?;
52 Ok(())
53 }
54
55 pub fn to_tsv(&self, outpath: &Path) -> Result<(), io::Error> {
57 let mut file = BufWriter::new(File::create(&outpath)?);
58 for id in &self.accessions {
59 file.write_all(format!("{}\n", id).as_bytes())?;
60 }
61 Ok(())
62 }
63}
64
65#[derive(Debug, PartialEq)]
78pub struct FastaLengths {
79 pub sequence_lengths: HashMap<String, usize>,
80}
81
82impl FastaLengths {
83 pub fn from_fasta(path: &Path, separator: &str, id_index: usize) -> Self {
84 let reader = FastaReader::new(path);
85 let mut entries: HashMap<String, usize> = HashMap::new();
86 for [header, seq] in reader {
87 entries.insert(
88 seq_id_from_description(&header, separator, id_index).to_string(),
89 seq.len(),
90 );
91 }
92 FastaLengths {
93 sequence_lengths: entries,
94 }
95 }
96
97 pub fn to_json(&self, outpath: &Path) -> Result<(), io::Error> {
99 let mut file = BufWriter::new(File::create(&outpath)?);
100 serde_json::to_writer(&mut file, &self.sequence_lengths)?;
101 Ok(())
102 }
103
104 pub fn distribution_to_json(&self, outpath: &Path) -> Result<(), io::Error> {
106 let mut len_counts = HashMap::new();
107 for len in self.sequence_lengths.values() {
108 *len_counts.entry(len).or_insert(0) += 1;
109 }
110 let mut file = BufWriter::new(File::create(&outpath)?);
111 serde_json::to_writer(&mut file, &len_counts)?;
112 Ok(())
113 }
114
115 pub fn max(&self) -> Option<&usize> {
116 self.sequence_lengths.values().max()
117 }
118}
119
120#[derive(Debug, Serialize, Deserialize, PartialEq)]
146pub struct FastaEntry {
147 pub description: String,
148 pub sequence: String,
149}
150
151impl FastaEntry {
152 pub fn from_index(data: &Path, index: u64) -> Result<Self, Box<dyn error::Error>> {
153 let mut handle = BufReader::new(File::open(data)?);
154 handle.seek(SeekFrom::Start(index))?;
155
156 let mut lines = handle.lines();
157 let line = lines.next().unwrap().unwrap();
158 let description = if line.starts_with('>') {
159 line
160 } else {
161 return Err(Box::new(errors::ParseError::new(
162 errors::ErrorKind::IndexNotAtDescription,
163 "No description line found at index when reading entry!",
164 )));
165 };
166
167 let mut entry = FastaEntry {
168 description,
169 sequence: String::new(),
170 };
171
172 for l in lines {
173 let line = l.unwrap();
174 if line == "" || line.starts_with('>') {
175 break;
176 } else {
177 entry.sequence.push_str(&line);
178 }
179 }
180
181 Ok(entry)
182 }
183}
184
185#[cfg(test)]
186mod tests {
187 use super::*;
188
189 #[test]
190 fn accessions_from_fasta_short() {
191 assert_eq!(
192 FastaAccessions::from_fasta(Path::new("./resources/test_short_descr.fasta"), "|", 1)
193 .accessions,
194 vec!["Q2HZH0", "P93158", "H0VS30"]
195 )
196 }
197
198 #[test]
199 fn accessions_from_fasta_long() {
200 assert_eq!(
201 FastaAccessions::from_fasta(Path::new("./resources/test.fasta"), "|", 1).accessions,
202 vec!["Q2HZH0", "P93158", "H0VS30"]
203 )
204 }
205
206 #[test]
207 fn get_single_fasta_entry() {
208 let index =
209 crate::index::FastaIndex::from_json(Path::new("./resources/test.index")).unwrap();
210 let entry = FastaEntry::from_index(
211 Path::new("./resources/test.fasta"),
212 *index.id_to_offset.get("P93158").unwrap(),
213 )
214 .unwrap();
215 let expected = FastaEntry {
216 description: ">tr|P93158|P93158_GOSHI Annexin (Fragment) OS=Gossypium \
217 hirsutum OX=3635 GN=AnnGh2 PE=2 SV=1"
218 .to_string(),
219 sequence: "TLKVPVHVPSPSEDAEWQLRKAFEGWGTNEQLIIDILAHRNAAQRNSIRKVYGEAYGEDL\
220 LKCLEKELTSDFERAVLLFTLDPAERDAHLANEATKKFTSSNWILMEIACSRSSHELLNV"
221 .to_string(),
222 };
223 assert_eq!(entry, expected);
224 }
225
226 #[test]
227 fn lengths_from_fasta() {
228 let lengths = FastaLengths::from_fasta(Path::new("./resources/test.fasta"), "|", 1);
229 let mut exp_map = HashMap::new();
230 exp_map.insert("H0VS30".to_string(), 180);
231 exp_map.insert("Q2HZH0".to_string(), 120);
232 exp_map.insert("P93158".to_string(), 120);
233 assert_eq!(
234 lengths,
235 FastaLengths {
236 sequence_lengths: exp_map
237 }
238 );
239 println!("{:?}", lengths);
240 }
241}