1use std::collections::BTreeMap;
2use std::fs;
3use std::io::Read;
4
5use crate::error::BioLibError;
6
7pub struct SeqUtilRecord {
8 pub sequence: String,
9 pub id: String,
10 pub description: String,
11 pub properties: BTreeMap<String, String>,
12}
13
14impl SeqUtilRecord {
15 pub fn new(
16 sequence: String,
17 sequence_id: String,
18 description: String,
19 properties: Option<BTreeMap<String, String>>,
20 ) -> crate::Result<Self> {
21 let properties = match properties {
22 Some(props) => {
23 for (key, value) in &props {
24 if key.contains(&['=', '[', ']', '\n'][..]) {
25 return Err(BioLibError::Validation(
26 "Key cannot contain characters =[] and newline".to_string(),
27 ));
28 }
29 if value.contains(&['=', '[', ']', '\n'][..]) {
30 return Err(BioLibError::Validation(
31 "Value cannot contain characters =[] and newline".to_string(),
32 ));
33 }
34 }
35 props
36 }
37 None => BTreeMap::new(),
38 };
39
40 Ok(Self {
41 sequence,
42 id: sequence_id,
43 description,
44 properties,
45 })
46 }
47}
48
49impl std::fmt::Display for SeqUtilRecord {
50 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51 write!(f, "SeqUtilRecord ({})", self.id)
52 }
53}
54
55pub struct ParseFastaOptions {
56 pub default_header: Option<String>,
57 pub allow_any_sequence_characters: bool,
58 pub use_strict_alphabet: bool,
59 pub allow_empty_sequence: bool,
60}
61
62impl Default for ParseFastaOptions {
63 fn default() -> Self {
64 Self {
65 default_header: None,
66 allow_any_sequence_characters: false,
67 use_strict_alphabet: false,
68 allow_empty_sequence: true,
69 }
70 }
71}
72
73pub struct SeqUtil;
74
75impl SeqUtil {
76 pub fn parse_fasta(
77 file_path: &str,
78 options: &ParseFastaOptions,
79 ) -> crate::Result<Vec<SeqUtilRecord>> {
80 let content = fs::read_to_string(file_path)?;
81 Self::parse_fasta_str(&content, Some(file_path), options)
82 }
83
84 pub fn parse_fasta_from_reader<R: Read>(
85 reader: R,
86 file_name: Option<&str>,
87 options: &ParseFastaOptions,
88 ) -> crate::Result<Vec<SeqUtilRecord>> {
89 let mut content = String::new();
90 let mut reader = std::io::BufReader::new(reader);
91 reader.read_to_string(&mut content)?;
92 Self::parse_fasta_str(&content, file_name, options)
93 }
94
95 pub fn parse_fasta_str(
96 content: &str,
97 file_name: Option<&str>,
98 options: &ParseFastaOptions,
99 ) -> crate::Result<Vec<SeqUtilRecord>> {
100 if options.allow_any_sequence_characters && options.use_strict_alphabet {
101 return Err(BioLibError::Validation(
102 "Please choose either allow_any_sequence_characters or use_strict_alphabet"
103 .to_string(),
104 ));
105 }
106
107 let mut records = Vec::new();
108 let mut header: Option<String> = None;
109 let mut sequence_lines: Vec<&str> = Vec::new();
110
111 for (line_number, line) in content.lines().enumerate() {
112 let line = line.trim();
113 if line.is_empty() {
114 continue;
115 }
116
117 if let Some(stripped) = line.strip_prefix('>') {
118 if let Some(h) = header.take() {
119 records.push(Self::build_record(&h, &sequence_lines, options)?);
120 }
121 header = Some(stripped.trim().to_string());
122 sequence_lines.clear();
123 } else if header.is_some() {
124 sequence_lines.push(line);
125 } else if let Some(ref default_header) = options.default_header {
126 let h = format!("{default_header}{line_number}");
127 records.push(Self::build_record(&h, &[line], options)?);
128 } else {
129 let name = file_name.unwrap_or("unknown");
130 return Err(BioLibError::Validation(format!(
131 "No header line found in FASTA file \"{name}\""
132 )));
133 }
134 }
135
136 if let Some(h) = header {
137 records.push(Self::build_record(&h, &sequence_lines, options)?);
138 }
139
140 Ok(records)
141 }
142
143 pub fn write_records_to_fasta(file_path: &str, records: &[SeqUtilRecord]) -> crate::Result<()> {
144 let mut content = String::new();
145 for record in records {
146 content.push('>');
147 content.push_str(&record.id);
148 if !record.description.is_empty() {
149 content.push(' ');
150 content.push_str(&record.description);
151 }
152 for (key, value) in &record.properties {
153 content.push_str(&format!(" [{key}={value}]"));
154 }
155 content.push('\n');
156 let chars: Vec<char> = record.sequence.chars().collect();
157 let lines: Vec<String> = chars.chunks(80).map(|c| c.iter().collect()).collect();
158 content.push_str(&lines.join("\n"));
159 content.push('\n');
160 }
161 fs::write(file_path, content)?;
162 Ok(())
163 }
164
165 fn build_record(
166 header: &str,
167 sequence_lines: &[&str],
168 options: &ParseFastaOptions,
169 ) -> crate::Result<SeqUtilRecord> {
170 let sequence: String = sequence_lines.concat();
171 let sequence_id = header.split_whitespace().next().unwrap_or(header);
172
173 if !options.allow_any_sequence_characters {
174 let invalid_chars = if options.use_strict_alphabet {
175 find_invalid_sequence_characters_strict(&sequence)
176 } else {
177 find_invalid_sequence_characters(&sequence)
178 };
179 if let Some(ch) = invalid_chars.first() {
180 return Err(BioLibError::Validation(format!(
181 "Invalid character (\"{ch}\") found in sequence {sequence_id}"
182 )));
183 }
184 }
185
186 if !options.allow_empty_sequence && sequence.is_empty() {
187 return Err(BioLibError::Validation(format!(
188 "No sequence found for fasta entry {sequence_id}"
189 )));
190 }
191
192 let description = header[sequence_id.len()..].trim().to_string();
193 SeqUtilRecord::new(sequence, sequence_id.to_string(), description, None)
194 }
195}
196
197fn find_invalid_sequence_characters(sequence: &str) -> Vec<char> {
198 sequence
199 .chars()
200 .filter(|c| !c.is_ascii_alphanumeric() && *c != '-' && *c != '_' && *c != '.')
201 .collect()
202}
203
204fn find_invalid_sequence_characters_strict(sequence: &str) -> Vec<char> {
207 const ALLOWED: &str = "lagvsertidpkqnfymhwcxbuzoLAGVSERTIDPKQNFYMHWCXBUZO-.";
208 sequence.chars().filter(|c| !ALLOWED.contains(*c)).collect()
209}