use std::collections::BTreeMap;
use std::fs;
use std::io::Read;
use crate::error::BioLibError;
/// A single sequence record: the sequence text together with its identifier,
/// a free-text description and a set of key/value properties.
pub struct SeqUtilRecord {
/// The sequence text. The FASTA parser in this file stores the record's
/// sequence lines concatenated without separators.
pub sequence: String,
/// The record identifier. The FASTA parser in this file uses the first
/// whitespace-delimited token of the header line.
pub id: String,
/// Free-text description; empty when there is none. The FASTA writer in this
/// file only emits it when it is non-empty.
pub description: String,
/// Key/value annotations, iterated in key order (`BTreeMap`). The FASTA writer
/// in this file emits each entry as ` [key=value]` on the header line.
pub properties: BTreeMap<String, String>,
}
impl SeqUtilRecord {
    /// Builds a record, validating the optional properties first.
    ///
    /// A missing `properties` argument is treated as an empty map.
    ///
    /// # Errors
    /// Returns `BioLibError::Validation` when any property key or value contains
    /// `=`, `[`, `]` or a newline. Entries are checked in map order, and within
    /// an entry the key is checked before the value.
    pub fn new(
        sequence: String,
        sequence_id: String,
        description: String,
        properties: Option<BTreeMap<String, String>>,
    ) -> crate::Result<Self> {
        // Characters that would collide with the ` [key=value]` header syntax.
        const FORBIDDEN: &[char] = &['=', '[', ']', '\n'];

        let properties = properties.unwrap_or_default();
        for (name, content) in &properties {
            let problem = if name.contains(FORBIDDEN) {
                Some("Key cannot contain characters =[] and newline")
            } else if content.contains(FORBIDDEN) {
                Some("Value cannot contain characters =[] and newline")
            } else {
                None
            };
            if let Some(message) = problem {
                return Err(BioLibError::Validation(message.to_string()));
            }
        }

        Ok(Self {
            id: sequence_id,
            sequence,
            description,
            properties,
        })
    }
}
impl std::fmt::Display for SeqUtilRecord {
    /// Renders the record as `SeqUtilRecord (<id>)`; only the identifier is shown.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { id, .. } = self;
        out.write_fmt(format_args!("SeqUtilRecord ({id})"))
    }
}
/// Settings that control how the FASTA parser in this file treats its input.
pub struct ParseFastaOptions {
    /// Header prefix for sequence lines that appear before any `>` header.
    /// When set, each such line becomes its own record whose header is this
    /// prefix followed by the line's index; when `None`, such a line is an error.
    pub default_header: Option<String>,
    /// Skip sequence character validation entirely.
    /// Cannot be combined with `use_strict_alphabet`.
    pub allow_any_sequence_characters: bool,
    /// Validate sequences against the strict alphabet instead of the default
    /// character check. Cannot be combined with `allow_any_sequence_characters`.
    pub use_strict_alphabet: bool,
    /// Accept records that have no sequence; when `false` they are rejected.
    pub allow_empty_sequence: bool,
}

impl Default for ParseFastaOptions {
    /// Defaults: no fallback header, default character validation, and
    /// records with an empty sequence are accepted.
    fn default() -> Self {
        Self {
            allow_empty_sequence: true,
            use_strict_alphabet: false,
            allow_any_sequence_characters: false,
            default_header: None,
        }
    }
}
/// Namespace type for the FASTA reading and writing helpers below.
pub struct SeqUtil;

impl SeqUtil {
    /// Reads the file at `file_path` into memory and parses it as FASTA.
    ///
    /// `file_path` is also passed on as the file name used in error messages.
    ///
    /// # Errors
    /// Propagates the error from reading the file, and any error returned by
    /// [`SeqUtil::parse_fasta_str`].
    pub fn parse_fasta(
        file_path: &str,
        options: &ParseFastaOptions,
    ) -> crate::Result<Vec<SeqUtilRecord>> {
        let content = fs::read_to_string(file_path)?;
        Self::parse_fasta_str(&content, Some(file_path), options)
    }

    /// Reads `reader` to the end as UTF-8 text and parses it as FASTA.
    ///
    /// `file_name` is only used in error messages.
    ///
    /// # Errors
    /// Propagates the error from reading `reader`, and any error returned by
    /// [`SeqUtil::parse_fasta_str`].
    pub fn parse_fasta_from_reader<R: Read>(
        mut reader: R,
        file_name: Option<&str>,
        options: &ParseFastaOptions,
    ) -> crate::Result<Vec<SeqUtilRecord>> {
        // The whole input is consumed by a single `read_to_string` call, so an
        // extra buffering layer around the reader would add nothing.
        let mut content = String::new();
        reader.read_to_string(&mut content)?;
        Self::parse_fasta_str(&content, file_name, options)
    }

    /// Parses FASTA text into records.
    ///
    /// Every line is trimmed and blank lines are skipped. A line starting with
    /// `>` opens a new record and finishes the previous one; other lines are
    /// appended to the sequence of the current record. A sequence line that
    /// appears before any header becomes a record of its own when
    /// `options.default_header` is set: its header is the default header
    /// followed by the zero-based index of that line within `content`.
    ///
    /// # Errors
    /// Returns `BioLibError::Validation` when
    /// * both `allow_any_sequence_characters` and `use_strict_alphabet` are set,
    /// * a sequence line precedes every header and no default header is
    ///   configured (the message names `file_name`, or `unknown` when absent),
    /// * a record fails the character or empty-sequence checks.
    pub fn parse_fasta_str(
        content: &str,
        file_name: Option<&str>,
        options: &ParseFastaOptions,
    ) -> crate::Result<Vec<SeqUtilRecord>> {
        if options.allow_any_sequence_characters && options.use_strict_alphabet {
            return Err(BioLibError::Validation(
                "Please choose either allow_any_sequence_characters or use_strict_alphabet"
                    .to_string(),
            ));
        }

        let mut records = Vec::new();
        // Header of the record currently being collected, if any.
        let mut header: Option<String> = None;
        let mut sequence_lines: Vec<&str> = Vec::new();

        for (line_number, line) in content.lines().enumerate() {
            let line = line.trim();
            if line.is_empty() {
                continue;
            }

            if let Some(stripped) = line.strip_prefix('>') {
                // A new header closes the record collected so far.
                if let Some(h) = header.take() {
                    records.push(Self::build_record(&h, &sequence_lines, options)?);
                }
                header = Some(stripped.trim().to_string());
                sequence_lines.clear();
            } else if header.is_some() {
                sequence_lines.push(line);
            } else if let Some(ref default_header) = options.default_header {
                // Headerless line: it becomes a stand-alone single-line record.
                let h = format!("{default_header}{line_number}");
                records.push(Self::build_record(&h, &[line], options)?);
            } else {
                let name = file_name.unwrap_or("unknown");
                return Err(BioLibError::Validation(format!(
                    "No header line found in FASTA file \"{name}\""
                )));
            }
        }

        // Flush the last record; a trailing header without sequence lines is
        // still handed to `build_record`, which decides whether it is allowed.
        if let Some(h) = header {
            records.push(Self::build_record(&h, &sequence_lines, options)?);
        }
        Ok(records)
    }

    /// Writes `records` to `file_path` in FASTA format, replacing the file's content.
    ///
    /// Each record is written as a header line `>id`, followed by ` description`
    /// when the description is non-empty and ` [key=value]` for every property,
    /// then the sequence wrapped at 80 characters per line. A record with an
    /// empty sequence produces an empty sequence line.
    ///
    /// # Errors
    /// Propagates the error from writing the file.
    pub fn write_records_to_fasta(file_path: &str, records: &[SeqUtilRecord]) -> crate::Result<()> {
        /// Maximum number of characters per sequence line.
        const LINE_WIDTH: usize = 80;

        let mut content = String::new();
        for record in records {
            content.push('>');
            content.push_str(&record.id);
            if !record.description.is_empty() {
                content.push(' ');
                content.push_str(&record.description);
            }
            for (key, value) in &record.properties {
                content.push_str(" [");
                content.push_str(key);
                content.push('=');
                content.push_str(value);
                content.push(']');
            }
            content.push('\n');

            // Wrap by characters (not bytes) so multi-byte characters are never split.
            for (index, symbol) in record.sequence.chars().enumerate() {
                if index != 0 && index % LINE_WIDTH == 0 {
                    content.push('\n');
                }
                content.push(symbol);
            }
            content.push('\n');
        }

        fs::write(file_path, content)?;
        Ok(())
    }

    /// Turns a header (without the leading `>`) and its sequence lines into a record.
    ///
    /// The sequence lines are concatenated without separators. The identifier
    /// is the first whitespace-delimited token of the header and the
    /// description is the trimmed remainder.
    ///
    /// # Errors
    /// Returns `BioLibError::Validation` when character validation is enabled
    /// and the sequence contains an invalid character, or when the sequence is
    /// empty and `options.allow_empty_sequence` is `false`.
    fn build_record(
        header: &str,
        sequence_lines: &[&str],
        options: &ParseFastaOptions,
    ) -> crate::Result<SeqUtilRecord> {
        let sequence: String = sequence_lines.concat();

        // A header built from `default_header` may start with whitespace.
        // Drop it first so the identifier is guaranteed to start at the
        // beginning; splitting at the first whitespace then yields the
        // identifier and the description without any byte-offset slicing.
        let header = header.trim_start();
        let (sequence_id, remainder) = header
            .split_once(char::is_whitespace)
            .unwrap_or((header, ""));

        if !options.allow_any_sequence_characters {
            let invalid_chars = if options.use_strict_alphabet {
                find_invalid_sequence_characters_strict(&sequence)
            } else {
                find_invalid_sequence_characters(&sequence)
            };
            if let Some(ch) = invalid_chars.first() {
                return Err(BioLibError::Validation(format!(
                    "Invalid character (\"{ch}\") found in sequence {sequence_id}"
                )));
            }
        }

        if !options.allow_empty_sequence && sequence.is_empty() {
            return Err(BioLibError::Validation(format!(
                "No sequence found for fasta entry {sequence_id}"
            )));
        }

        let description = remainder.trim().to_string();
        SeqUtilRecord::new(sequence, sequence_id.to_string(), description, None)
    }
}
/// Returns every character of `sequence`, in order of appearance, that is
/// neither an ASCII letter or digit nor one of `-`, `_`, `.`.
///
/// An empty result means the sequence passed the check.
fn find_invalid_sequence_characters(sequence: &str) -> Vec<char> {
    let mut rejected = Vec::new();
    for symbol in sequence.chars() {
        let permitted = symbol.is_ascii_alphanumeric() || matches!(symbol, '-' | '_' | '.');
        if !permitted {
            rejected.push(symbol);
        }
    }
    rejected
}
/// Returns every character of `sequence`, in order of appearance, that is not
/// part of the strict alphabet listed in `ALLOWED` (a fixed set of lower- and
/// upper-case letters plus `-` and `.`).
///
/// An empty result means the sequence passed the check.
fn find_invalid_sequence_characters_strict(sequence: &str) -> Vec<char> {
    const ALLOWED: &str = "lagvsertidpkqnfymhwcxbuzoLAGVSERTIDPKQNFYMHWCXBUZO-.";

    let mut rejected = Vec::new();
    for symbol in sequence.chars() {
        if ALLOWED.find(symbol).is_none() {
            rejected.push(symbol);
        }
    }
    rejected
}