Skip to main content

biolib/
seq_util.rs

1use std::collections::BTreeMap;
2use std::fs;
3use std::io::Read;
4
5use crate::error::BioLibError;
6
7pub struct SeqUtilRecord {
8    pub sequence: String,
9    pub id: String,
10    pub description: String,
11    pub properties: BTreeMap<String, String>,
12}
13
14impl SeqUtilRecord {
15    pub fn new(
16        sequence: String,
17        sequence_id: String,
18        description: String,
19        properties: Option<BTreeMap<String, String>>,
20    ) -> crate::Result<Self> {
21        let properties = match properties {
22            Some(props) => {
23                for (key, value) in &props {
24                    if key.contains(&['=', '[', ']', '\n'][..]) {
25                        return Err(BioLibError::Validation(
26                            "Key cannot contain characters =[] and newline".to_string(),
27                        ));
28                    }
29                    if value.contains(&['=', '[', ']', '\n'][..]) {
30                        return Err(BioLibError::Validation(
31                            "Value cannot contain characters =[] and newline".to_string(),
32                        ));
33                    }
34                }
35                props
36            }
37            None => BTreeMap::new(),
38        };
39
40        Ok(Self {
41            sequence,
42            id: sequence_id,
43            description,
44            properties,
45        })
46    }
47}
48
49impl std::fmt::Display for SeqUtilRecord {
50    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51        write!(f, "SeqUtilRecord ({})", self.id)
52    }
53}
54
55pub struct ParseFastaOptions {
56    pub default_header: Option<String>,
57    pub allow_any_sequence_characters: bool,
58    pub use_strict_alphabet: bool,
59    pub allow_empty_sequence: bool,
60}
61
62impl Default for ParseFastaOptions {
63    fn default() -> Self {
64        Self {
65            default_header: None,
66            allow_any_sequence_characters: false,
67            use_strict_alphabet: false,
68            allow_empty_sequence: true,
69        }
70    }
71}
72
73pub struct SeqUtil;
74
75impl SeqUtil {
76    pub fn parse_fasta(
77        file_path: &str,
78        options: &ParseFastaOptions,
79    ) -> crate::Result<Vec<SeqUtilRecord>> {
80        let content = fs::read_to_string(file_path)?;
81        Self::parse_fasta_str(&content, Some(file_path), options)
82    }
83
84    pub fn parse_fasta_from_reader<R: Read>(
85        reader: R,
86        file_name: Option<&str>,
87        options: &ParseFastaOptions,
88    ) -> crate::Result<Vec<SeqUtilRecord>> {
89        let mut content = String::new();
90        let mut reader = std::io::BufReader::new(reader);
91        reader.read_to_string(&mut content)?;
92        Self::parse_fasta_str(&content, file_name, options)
93    }
94
95    pub fn parse_fasta_str(
96        content: &str,
97        file_name: Option<&str>,
98        options: &ParseFastaOptions,
99    ) -> crate::Result<Vec<SeqUtilRecord>> {
100        if options.allow_any_sequence_characters && options.use_strict_alphabet {
101            return Err(BioLibError::Validation(
102                "Please choose either allow_any_sequence_characters or use_strict_alphabet"
103                    .to_string(),
104            ));
105        }
106
107        let mut records = Vec::new();
108        let mut header: Option<String> = None;
109        let mut sequence_lines: Vec<&str> = Vec::new();
110
111        for (line_number, line) in content.lines().enumerate() {
112            let line = line.trim();
113            if line.is_empty() {
114                continue;
115            }
116
117            if let Some(stripped) = line.strip_prefix('>') {
118                if let Some(h) = header.take() {
119                    records.push(Self::build_record(&h, &sequence_lines, options)?);
120                }
121                header = Some(stripped.trim().to_string());
122                sequence_lines.clear();
123            } else if header.is_some() {
124                sequence_lines.push(line);
125            } else if let Some(ref default_header) = options.default_header {
126                let h = format!("{default_header}{line_number}");
127                records.push(Self::build_record(&h, &[line], options)?);
128            } else {
129                let name = file_name.unwrap_or("unknown");
130                return Err(BioLibError::Validation(format!(
131                    "No header line found in FASTA file \"{name}\""
132                )));
133            }
134        }
135
136        if let Some(h) = header {
137            records.push(Self::build_record(&h, &sequence_lines, options)?);
138        }
139
140        Ok(records)
141    }
142
143    pub fn write_records_to_fasta(file_path: &str, records: &[SeqUtilRecord]) -> crate::Result<()> {
144        let mut content = String::new();
145        for record in records {
146            content.push('>');
147            content.push_str(&record.id);
148            if !record.description.is_empty() {
149                content.push(' ');
150                content.push_str(&record.description);
151            }
152            for (key, value) in &record.properties {
153                content.push_str(&format!(" [{key}={value}]"));
154            }
155            content.push('\n');
156            let chars: Vec<char> = record.sequence.chars().collect();
157            let lines: Vec<String> = chars.chunks(80).map(|c| c.iter().collect()).collect();
158            content.push_str(&lines.join("\n"));
159            content.push('\n');
160        }
161        fs::write(file_path, content)?;
162        Ok(())
163    }
164
165    fn build_record(
166        header: &str,
167        sequence_lines: &[&str],
168        options: &ParseFastaOptions,
169    ) -> crate::Result<SeqUtilRecord> {
170        let sequence: String = sequence_lines.concat();
171        let sequence_id = header.split_whitespace().next().unwrap_or(header);
172
173        if !options.allow_any_sequence_characters {
174            let invalid_chars = if options.use_strict_alphabet {
175                find_invalid_sequence_characters_strict(&sequence)
176            } else {
177                find_invalid_sequence_characters(&sequence)
178            };
179            if let Some(ch) = invalid_chars.first() {
180                return Err(BioLibError::Validation(format!(
181                    "Invalid character (\"{ch}\") found in sequence {sequence_id}"
182                )));
183            }
184        }
185
186        if !options.allow_empty_sequence && sequence.is_empty() {
187            return Err(BioLibError::Validation(format!(
188                "No sequence found for fasta entry {sequence_id}"
189            )));
190        }
191
192        let description = header[sequence_id.len()..].trim().to_string();
193        SeqUtilRecord::new(sequence, sequence_id.to_string(), description, None)
194    }
195}
196
197fn find_invalid_sequence_characters(sequence: &str) -> Vec<char> {
198    sequence
199        .chars()
200        .filter(|c| !c.is_ascii_alphanumeric() && *c != '-' && *c != '_' && *c != '.')
201        .collect()
202}
203
204// Equivalent to fair-esm alphabet, compatible with ESM-models
205// https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/constants.py#L8
206fn find_invalid_sequence_characters_strict(sequence: &str) -> Vec<char> {
207    const ALLOWED: &str = "lagvsertidpkqnfymhwcxbuzoLAGVSERTIDPKQNFYMHWCXBUZO-.";
208    sequence.chars().filter(|c| !ALLOWED.contains(*c)).collect()
209}