fasta/
pieces.rs

1//! Structs that represent specific pieces of information
2//! found in FASTA files. Useful for extracting and storing
3//! these parts.
4
5use crate::errors;
6use crate::helpers::seq_id_from_description;
7use crate::read::FastaReader;
8
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::error;
12use std::fs::File;
13use std::io;
14use std::io::prelude::Seek;
15use std::io::BufWriter;
16use std::io::{BufRead, BufReader, SeekFrom, Write};
17use std::path::Path;
18
19/// A convenience struct for parsing the acession ids from FASTA description lines.
20///
21/// # Examples
22/// Extract the accession ids from a UniProt formatted FASTA and write them to
23/// a tsv file, one per line.
24/// ```
25/// use fasta::pieces::FastaAccessions;
26/// use std::path::Path;
27///
28/// // parse the accessions
29/// let accessions = FastaAccessions::from_fasta(Path::new("./resources/test.fasta"), "|", 1);
30/// // write to tsv
31/// accessions.to_tsv(Path::new("./resources/test.accessions")).expect("Dumping tsv failed");
32/// ```
33#[derive(Debug)]
34pub struct FastaAccessions {
35    pub accessions: Vec<String>,
36}
37
38impl FastaAccessions {
39    pub fn from_fasta(path: &Path, separator: &str, id_index: usize) -> Self {
40        let reader = FastaReader::new(path);
41        let mut accessions = Vec::new();
42        for [header, _seq] in reader {
43            accessions.push(seq_id_from_description(&header, separator, id_index).to_string());
44        }
45        FastaAccessions { accessions }
46    }
47
48    /// Writes the accessions to json.
49    pub fn to_json(&self, outpath: &Path) -> Result<(), io::Error> {
50        let mut file = BufWriter::new(File::create(&outpath)?);
51        serde_json::to_writer(&mut file, &self.accessions)?;
52        Ok(())
53    }
54
55    /// Writes the accessions to a txt file, one per line.
56    pub fn to_tsv(&self, outpath: &Path) -> Result<(), io::Error> {
57        let mut file = BufWriter::new(File::create(&outpath)?);
58        for id in &self.accessions {
59            file.write_all(format!("{}\n", id).as_bytes())?;
60        }
61        Ok(())
62    }
63}
64
65/// A convenient struct that wraps a sequence id to sequence length mapping.
66///
67/// # Examples
68/// ```
69/// use std::path::Path;
70/// use fasta::pieces::FastaLengths;
71///
72/// // parse a fasta file
73/// let lengths = FastaLengths::from_fasta(Path::new("./resources/test.fasta"), "|", 1);
74/// // write to json
75/// lengths.to_json(Path::new("./resources/test.accessions")).expect("JSON dump failed");
76/// ```
77#[derive(Debug, PartialEq)]
78pub struct FastaLengths {
79    pub sequence_lengths: HashMap<String, usize>,
80}
81
82impl FastaLengths {
83    pub fn from_fasta(path: &Path, separator: &str, id_index: usize) -> Self {
84        let reader = FastaReader::new(path);
85        let mut entries: HashMap<String, usize> = HashMap::new();
86        for [header, seq] in reader {
87            entries.insert(
88                seq_id_from_description(&header, separator, id_index).to_string(),
89                seq.len(),
90            );
91        }
92        FastaLengths {
93            sequence_lengths: entries,
94        }
95    }
96
97    /// Writes the ID -> Sequence length mapping to .json.
98    pub fn to_json(&self, outpath: &Path) -> Result<(), io::Error> {
99        let mut file = BufWriter::new(File::create(&outpath)?);
100        serde_json::to_writer(&mut file, &self.sequence_lengths)?;
101        Ok(())
102    }
103
104    /// Writes lengths and their counts to a .json file.
105    pub fn distribution_to_json(&self, outpath: &Path) -> Result<(), io::Error> {
106        let mut len_counts = HashMap::new();
107        for len in self.sequence_lengths.values() {
108            *len_counts.entry(len).or_insert(0) += 1;
109        }
110        let mut file = BufWriter::new(File::create(&outpath)?);
111        serde_json::to_writer(&mut file, &len_counts)?;
112        Ok(())
113    }
114
115    pub fn max(&self) -> Option<&usize> {
116        self.sequence_lengths.values().max()
117    }
118}
119
120/// A single FASTA entry with description and header.
121///
122/// # Examples
123/// Read a speciic entry from an indexed FASTA file:
124/// ```
125/// use fasta::index::FastaIndex;
126/// use std::path::Path;
127/// use fasta::pieces::FastaEntry;
128/// let index =
129///     FastaIndex::from_json(Path::new("./resources/test.index")).unwrap();
130/// let entry = FastaEntry::from_index(
131///     Path::new("./resources/test.fasta"),
132///     *index.id_to_offset.get("P93158").unwrap(),
133/// )
134/// .unwrap();
135/// let expected = FastaEntry {
136///     description: ">tr|P93158|P93158_GOSHI Annexin (Fragment) OS=Gossypium \
137///     hirsutum OX=3635 GN=AnnGh2 PE=2 SV=1"
138///         .to_string(),
139///     sequence: "TLKVPVHVPSPSEDAEWQLRKAFEGWGTNEQLIIDILAHRNAAQRNSIRKVYGEAYGEDL\
140///     LKCLEKELTSDFERAVLLFTLDPAERDAHLANEATKKFTSSNWILMEIACSRSSHELLNV"
141///         .to_string(),
142/// };
143/// assert_eq!(entry, expected);
144/// ```
145#[derive(Debug, Serialize, Deserialize, PartialEq)]
146pub struct FastaEntry {
147    pub description: String,
148    pub sequence: String,
149}
150
151impl FastaEntry {
152    pub fn from_index(data: &Path, index: u64) -> Result<Self, Box<dyn error::Error>> {
153        let mut handle = BufReader::new(File::open(data)?);
154        handle.seek(SeekFrom::Start(index))?;
155
156        let mut lines = handle.lines();
157        let line = lines.next().unwrap().unwrap();
158        let description = if line.starts_with('>') {
159            line
160        } else {
161            return Err(Box::new(errors::ParseError::new(
162                errors::ErrorKind::IndexNotAtDescription,
163                "No description line found at index when reading entry!",
164            )));
165        };
166
167        let mut entry = FastaEntry {
168            description,
169            sequence: String::new(),
170        };
171
172        for l in lines {
173            let line = l.unwrap();
174            if line == "" || line.starts_with('>') {
175                break;
176            } else {
177                entry.sequence.push_str(&line);
178            }
179        }
180
181        Ok(entry)
182    }
183}
184
185#[cfg(test)]
186mod tests {
187    use super::*;
188
189    #[test]
190    fn accessions_from_fasta_short() {
191        assert_eq!(
192            FastaAccessions::from_fasta(Path::new("./resources/test_short_descr.fasta"), "|", 1)
193                .accessions,
194            vec!["Q2HZH0", "P93158", "H0VS30"]
195        )
196    }
197
198    #[test]
199    fn accessions_from_fasta_long() {
200        assert_eq!(
201            FastaAccessions::from_fasta(Path::new("./resources/test.fasta"), "|", 1).accessions,
202            vec!["Q2HZH0", "P93158", "H0VS30"]
203        )
204    }
205
206    #[test]
207    fn get_single_fasta_entry() {
208        let index =
209            crate::index::FastaIndex::from_json(Path::new("./resources/test.index")).unwrap();
210        let entry = FastaEntry::from_index(
211            Path::new("./resources/test.fasta"),
212            *index.id_to_offset.get("P93158").unwrap(),
213        )
214        .unwrap();
215        let expected = FastaEntry {
216            description: ">tr|P93158|P93158_GOSHI Annexin (Fragment) OS=Gossypium \
217            hirsutum OX=3635 GN=AnnGh2 PE=2 SV=1"
218                .to_string(),
219            sequence: "TLKVPVHVPSPSEDAEWQLRKAFEGWGTNEQLIIDILAHRNAAQRNSIRKVYGEAYGEDL\
220            LKCLEKELTSDFERAVLLFTLDPAERDAHLANEATKKFTSSNWILMEIACSRSSHELLNV"
221                .to_string(),
222        };
223        assert_eq!(entry, expected);
224    }
225
226    #[test]
227    fn lengths_from_fasta() {
228        let lengths = FastaLengths::from_fasta(Path::new("./resources/test.fasta"), "|", 1);
229        let mut exp_map = HashMap::new();
230        exp_map.insert("H0VS30".to_string(), 180);
231        exp_map.insert("Q2HZH0".to_string(), 120);
232        exp_map.insert("P93158".to_string(), 120);
233        assert_eq!(
234            lengths,
235            FastaLengths {
236                sequence_lengths: exp_map
237            }
238        );
239        println!("{:?}", lengths);
240    }
241}