// biolib 1.3.301
//
// BioLib client library and CLI for running applications on BioLib
// Documentation
use std::collections::BTreeMap;
use std::fs;
use std::io::Read;

use crate::error::BioLibError;

/// A single sequence entry as read from, or written to, a FASTA file.
///
/// The common traits are derived so records can be logged with `{:?}`,
/// duplicated, and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SeqUtilRecord {
    /// The sequence characters, without line breaks.
    pub sequence: String,
    /// The record identifier.
    pub id: String,
    /// Free-text description; may be empty.
    pub description: String,
    /// Key/value properties attached to the record, ordered by key.
    pub properties: BTreeMap<String, String>,
}

impl SeqUtilRecord {
    /// Builds a record, validating the optional `properties` map.
    ///
    /// When `properties` is `None` the record gets an empty map.
    ///
    /// # Errors
    ///
    /// Returns `BioLibError::Validation` if any property key or value contains
    /// `=`, `[`, `]` or a newline. Entries are checked in key order, and the
    /// key of an entry is checked before its value.
    pub fn new(
        sequence: String,
        sequence_id: String,
        description: String,
        properties: Option<BTreeMap<String, String>>,
    ) -> crate::Result<Self> {
        // Characters that are rejected in both keys and values.
        let has_forbidden_char =
            |text: &str| text.chars().any(|ch| matches!(ch, '=' | '[' | ']' | '\n'));

        let properties = properties.unwrap_or_default();
        for (name, content) in &properties {
            if has_forbidden_char(name) {
                return Err(BioLibError::Validation(
                    "Key cannot contain characters =[] and newline".to_string(),
                ));
            }
            if has_forbidden_char(content) {
                return Err(BioLibError::Validation(
                    "Value cannot contain characters =[] and newline".to_string(),
                ));
            }
        }

        let record = Self {
            id: sequence_id,
            sequence,
            description,
            properties,
        };
        Ok(record)
    }
}

impl std::fmt::Display for SeqUtilRecord {
    /// Renders the record as `SeqUtilRecord (<id>)`; no other field is shown.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { id, .. } = self;
        f.write_fmt(format_args!("SeqUtilRecord ({})", id))
    }
}

/// Options controlling how FASTA input is parsed and validated.
///
/// The common traits are derived so option sets can be logged with `{:?}`,
/// duplicated, and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseFastaOptions {
    /// Prefix used to synthesize a header for sequence lines that appear
    /// before any `>` header line; `None` makes such input an error.
    pub default_header: Option<String>,
    /// Skip sequence character validation entirely.
    /// Cannot be combined with `use_strict_alphabet`.
    pub allow_any_sequence_characters: bool,
    /// Validate sequences against the strict alphabet instead of the default
    /// one. Cannot be combined with `allow_any_sequence_characters`.
    pub use_strict_alphabet: bool,
    /// Accept records that have a header but no sequence characters.
    pub allow_empty_sequence: bool,
}

impl Default for ParseFastaOptions {
    fn default() -> Self {
        Self {
            default_header: None,
            allow_any_sequence_characters: false,
            use_strict_alphabet: false,
            allow_empty_sequence: true,
        }
    }
}

/// Namespace type for the FASTA parsing and writing functions; it holds no data.
pub struct SeqUtil;

impl SeqUtil {
    /// Reads the file at `file_path` and parses its content as FASTA.
    ///
    /// `file_path` is also used as the file name in the "no header" error
    /// message.
    ///
    /// # Errors
    ///
    /// Propagates the error from reading the file and any error returned by
    /// [`SeqUtil::parse_fasta_str`].
    pub fn parse_fasta(
        file_path: &str,
        options: &ParseFastaOptions,
    ) -> crate::Result<Vec<SeqUtilRecord>> {
        let content = fs::read_to_string(file_path)?;
        Self::parse_fasta_str(&content, Some(file_path), options)
    }

    /// Reads `reader` to the end and parses the text as FASTA.
    ///
    /// `file_name` is only used in the "no header" error message.
    ///
    /// # Errors
    ///
    /// Propagates the read error (including invalid UTF-8) and any error
    /// returned by [`SeqUtil::parse_fasta_str`].
    pub fn parse_fasta_from_reader<R: Read>(
        mut reader: R,
        file_name: Option<&str>,
        options: &ParseFastaOptions,
    ) -> crate::Result<Vec<SeqUtilRecord>> {
        // The whole input is read in one call, so an intermediate `BufReader`
        // would only add an extra copy.
        let mut content = String::new();
        reader.read_to_string(&mut content)?;
        Self::parse_fasta_str(&content, file_name, options)
    }

    /// Parses FASTA text into records.
    ///
    /// Every line is trimmed and blank lines are skipped. A line starting with
    /// `>` begins a new record; the following lines are concatenated into its
    /// sequence. A sequence line that appears before any header becomes a
    /// single-line record named `<default_header><line index>` (the index is
    /// zero-based) when `options.default_header` is set.
    ///
    /// # Errors
    ///
    /// Returns `BioLibError::Validation` when
    /// - both `allow_any_sequence_characters` and `use_strict_alphabet` are set,
    /// - a sequence line precedes any header and no default header is configured
    ///   (`file_name`, or `unknown`, is named in the message),
    /// - a record fails sequence validation (invalid character, or an empty
    ///   sequence when `allow_empty_sequence` is off).
    pub fn parse_fasta_str(
        content: &str,
        file_name: Option<&str>,
        options: &ParseFastaOptions,
    ) -> crate::Result<Vec<SeqUtilRecord>> {
        if options.allow_any_sequence_characters && options.use_strict_alphabet {
            return Err(BioLibError::Validation(
                "Please choose either allow_any_sequence_characters or use_strict_alphabet"
                    .to_string(),
            ));
        }

        let mut records = Vec::new();
        // Header of the record currently being accumulated, if any.
        let mut header: Option<String> = None;
        let mut sequence_lines: Vec<&str> = Vec::new();

        for (line_number, line) in content.lines().enumerate() {
            let line = line.trim();
            if line.is_empty() {
                continue;
            }

            if let Some(stripped) = line.strip_prefix('>') {
                // A new header closes the record that was being accumulated.
                if let Some(h) = header.take() {
                    records.push(Self::build_record(&h, &sequence_lines, options)?);
                }
                header = Some(stripped.trim().to_string());
                sequence_lines.clear();
            } else if header.is_some() {
                sequence_lines.push(line);
            } else if let Some(ref default_header) = options.default_header {
                // Headerless line: emit it as its own record right away.
                let h = format!("{default_header}{line_number}");
                records.push(Self::build_record(&h, &[line], options)?);
            } else {
                let name = file_name.unwrap_or("unknown");
                return Err(BioLibError::Validation(format!(
                    "No header line found in FASTA file \"{name}\""
                )));
            }
        }

        // Flush the final record; the input does not end with a header line.
        if let Some(h) = header {
            records.push(Self::build_record(&h, &sequence_lines, options)?);
        }

        Ok(records)
    }

    /// Writes `records` to `file_path` in FASTA format.
    ///
    /// Each record produces a header line `>id[ description][ [key=value]...]`
    /// followed by the sequence wrapped at 80 characters per line. Any existing
    /// file content is replaced.
    ///
    /// # Errors
    ///
    /// Propagates the error from writing the file.
    pub fn write_records_to_fasta(file_path: &str, records: &[SeqUtilRecord]) -> crate::Result<()> {
        const LINE_WIDTH: usize = 80;

        let mut content = String::new();
        for record in records {
            content.push('>');
            content.push_str(&record.id);
            if !record.description.is_empty() {
                content.push(' ');
                content.push_str(&record.description);
            }
            for (key, value) in &record.properties {
                content.push_str(" [");
                content.push_str(key);
                content.push('=');
                content.push_str(value);
                content.push(']');
            }
            content.push('\n');

            // Wrap by character count (not bytes), inserting a line break
            // before every 80th character except the first.
            for (index, ch) in record.sequence.chars().enumerate() {
                if index > 0 && index % LINE_WIDTH == 0 {
                    content.push('\n');
                }
                content.push(ch);
            }
            content.push('\n');
        }
        fs::write(file_path, content)?;
        Ok(())
    }

    /// Builds one record from a header (without the leading `>`) and its
    /// sequence lines.
    ///
    /// The record ID is the first whitespace-delimited token of the header and
    /// the description is the rest of the header, trimmed.
    ///
    /// # Errors
    ///
    /// Returns `BioLibError::Validation` if the sequence contains a character
    /// that the selected alphabet rejects, or if the sequence is empty while
    /// `allow_empty_sequence` is off.
    fn build_record(
        header: &str,
        sequence_lines: &[&str],
        options: &ParseFastaOptions,
    ) -> crate::Result<SeqUtilRecord> {
        let sequence: String = sequence_lines.concat();

        // Split the header at the first whitespace after skipping any leading
        // whitespace. Slicing the untrimmed header by the ID's byte length
        // would yield a wrong description, or panic inside a multi-byte
        // character, when the header starts with whitespace (possible through
        // `default_header`).
        let header = header.trim_start();
        let (sequence_id, remainder) = match header.find(char::is_whitespace) {
            Some(split_at) => header.split_at(split_at),
            None => (header, ""),
        };

        if !options.allow_any_sequence_characters {
            let invalid_chars = if options.use_strict_alphabet {
                find_invalid_sequence_characters_strict(&sequence)
            } else {
                find_invalid_sequence_characters(&sequence)
            };
            if let Some(ch) = invalid_chars.first() {
                return Err(BioLibError::Validation(format!(
                    "Invalid character (\"{ch}\") found in sequence {sequence_id}"
                )));
            }
        }

        if !options.allow_empty_sequence && sequence.is_empty() {
            return Err(BioLibError::Validation(format!(
                "No sequence found for fasta entry {sequence_id}"
            )));
        }

        let description = remainder.trim().to_string();
        SeqUtilRecord::new(sequence, sequence_id.to_string(), description, None)
    }
}

/// Returns every character of `sequence` that is not an ASCII letter, an
/// ASCII digit, `-`, `_` or `.`, in order of appearance (duplicates kept).
fn find_invalid_sequence_characters(sequence: &str) -> Vec<char> {
    let is_allowed = |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.');

    let mut rejected = Vec::new();
    for ch in sequence.chars() {
        if !is_allowed(ch) {
            rejected.push(ch);
        }
    }
    rejected
}

// Equivalent to fair-esm alphabet, compatible with ESM-models
// https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/constants.py#L8
/// Returns every character of `sequence` that is not part of the strict
/// alphabet, in order of appearance (duplicates kept).
fn find_invalid_sequence_characters_strict(sequence: &str) -> Vec<char> {
    const ALLOWED: &str = "lagvsertidpkqnfymhwcxbuzoLAGVSERTIDPKQNFYMHWCXBUZO-.";

    let mut rejected = Vec::new();
    for ch in sequence.chars() {
        let in_alphabet = ALLOWED.chars().any(|allowed| allowed == ch);
        if !in_alphabet {
            rejected.push(ch);
        }
    }
    rejected
}