use std::fs::File;
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
use thiserror::Error;
use crate::model::{Alignment, Sequence};
/// Errors reported by the FASTA readers and parsers in this module.
#[derive(Error, Debug)]
pub enum FastaError {
/// An underlying `std::io::Error`; `#[from]` lets `?` convert it automatically.
/// NOTE(review): the message says "open", but read failures propagated with `?`
/// (whole-file reads, per-line reads) are wrapped in this variant too — consider rewording.
#[error("Failed to open file: {0}")]
IoError(#[from] std::io::Error),
/// Returned by the parsers when the input produced no sequence records at all.
#[error("Empty FASTA file")]
EmptyFile,
/// The input violates the expected layout; the payload is a human-readable explanation
/// (the parsers in this file use it for header lines that carry no identifier).
#[error("Invalid FASTA format: {0}")]
InvalidFormat(String),
/// Sequence data was found before any `>` header line; the payload is the 1-based
/// line number at which that data was seen.
#[error("Sequence without header at line {0}")]
SequenceWithoutHeader(usize),
}
/// Shorthand for a `Result` whose error type is [`FastaError`].
pub type FastaResult<T> = Result<T, FastaError>;
/// Reads the FASTA file at `path` and parses it into an [`Alignment`].
///
/// Files larger than 1,000,000 bytes are loaded into memory with a single read and
/// handed to [`parse_fasta_fast`]; smaller files are streamed line by line through a
/// [`BufReader`] into [`parse_fasta`].
///
/// # Errors
///
/// Returns [`FastaError::IoError`] if the file cannot be opened, its metadata cannot be
/// queried, or its contents cannot be read; otherwise returns whatever error the
/// selected parser reports.
pub fn parse_fasta_file<P: AsRef<Path>>(path: P) -> FastaResult<Alignment> {
    // Size in bytes above which the whole file is read at once instead of streamed.
    const WHOLE_FILE_THRESHOLD: u64 = 1_000_000;

    let mut file = File::open(&path)?;
    // Keep the length as `u64` for the comparison: casting to `usize` first would
    // silently truncate on targets where `usize` is narrower than 64 bits.
    let byte_len = file.metadata()?.len();

    if byte_len > WHOLE_FILE_THRESHOLD {
        // The capacity is only a pre-allocation hint, so fall back to 0 if the length
        // does not fit in `usize`. The fully qualified call compiles regardless of
        // whether `TryFrom` is in the prelude of the crate's edition.
        let capacity_hint =
            <usize as std::convert::TryFrom<u64>>::try_from(byte_len).unwrap_or(0);
        let mut content = String::with_capacity(capacity_hint);
        // A single read-to-end call gains nothing from an intermediate `BufReader`,
        // so read straight from the file.
        file.read_to_string(&mut content)?;
        parse_fasta_fast(&content)
    } else {
        parse_fasta(BufReader::new(file))
    }
}
/// Parses FASTA-formatted text that is already held in memory.
///
/// Every line is trimmed and blank lines are skipped. A line starting with `>` opens a
/// new record whose identifier is the first whitespace-delimited token after the `>`;
/// any other line contributes its bytes (minus ASCII whitespace) to the record that is
/// currently open. A record that never receives any sequence bytes is not emitted.
///
/// # Errors
///
/// * [`FastaError::InvalidFormat`] if a header line has no identifier.
/// * [`FastaError::SequenceWithoutHeader`] if sequence data precedes the first header.
/// * [`FastaError::EmptyFile`] if no record with sequence data was produced.
pub fn parse_fasta_fast(content: &str) -> FastaResult<Alignment> {
    // Pre-size for roughly one record per 1000 bytes of input, but never fewer than ten.
    let mut records = Vec::with_capacity(std::cmp::max(content.len() / 1000, 10));
    let mut open_id: Option<&str> = None;
    let mut residues: Vec<u8> = Vec::new();
    // Length of the most recently emitted record; sizes the buffer of the next one.
    let mut last_len: usize = 1000;

    for (index, raw) in content.lines().enumerate() {
        let line_no = index + 1;
        let text = raw.trim();
        if text.is_empty() {
            continue;
        }

        match text.strip_prefix('>') {
            Some(header) => {
                // Close the record that was open, unless it collected nothing.
                match open_id.take() {
                    Some(finished) if !residues.is_empty() => {
                        last_len = residues.len();
                        residues.shrink_to_fit();
                        records.push(Sequence::from_bytes(
                            finished,
                            std::mem::take(&mut residues),
                        ));
                    }
                    _ => {}
                }

                let ident = header.split_whitespace().next().unwrap_or(header);
                if ident.is_empty() {
                    let message = format!("Empty sequence identifier at line {}", line_no);
                    return Err(FastaError::InvalidFormat(message));
                }
                open_id = Some(ident);
                residues = Vec::with_capacity(last_len.max(1000));
            }
            None => {
                if open_id.is_none() {
                    return Err(FastaError::SequenceWithoutHeader(line_no));
                }
                // Only pay for per-byte filtering when the line really has inner whitespace.
                if text.bytes().any(|b| b.is_ascii_whitespace()) {
                    residues.extend(text.bytes().filter(|b| !b.is_ascii_whitespace()));
                } else {
                    residues.extend_from_slice(text.as_bytes());
                }
            }
        }
    }

    // Emit the record still open at end of input, if it has any data.
    match open_id {
        Some(finished) if !residues.is_empty() => {
            residues.shrink_to_fit();
            records.push(Sequence::from_bytes(finished, residues));
        }
        _ => {}
    }

    if records.is_empty() {
        return Err(FastaError::EmptyFile);
    }
    records.shrink_to_fit();
    Ok(Alignment::new(records))
}
/// Parses FASTA records from any buffered reader, one line at a time.
///
/// Accepts the same grammar as [`parse_fasta_fast`]: lines are trimmed, blank lines are
/// skipped, a line starting with `>` opens a record whose identifier is the first
/// whitespace-delimited token after the `>`, and every other line appends its bytes
/// (minus ASCII whitespace) to the open record. A record that never receives any
/// sequence bytes is not emitted.
///
/// # Errors
///
/// * [`FastaError::IoError`] if reading a line from `reader` fails.
/// * [`FastaError::InvalidFormat`] if a header line has no identifier.
/// * [`FastaError::SequenceWithoutHeader`] if sequence data precedes the first header.
/// * [`FastaError::EmptyFile`] if no record with sequence data was produced.
pub fn parse_fasta<R: BufRead>(mut reader: R) -> FastaResult<Alignment> {
    let mut sequences = Vec::new();
    let mut current_id: Option<String> = None;
    let mut current_seq: Vec<u8> = Vec::new();
    let mut line_number: usize = 0;
    // Length of the most recently emitted record; sizes the buffer of the next one.
    let mut prev_seq_len: usize = 1000;
    // A single line buffer reused for the whole input, instead of the fresh `String`
    // that `BufRead::lines` would allocate for every line.
    let mut buf = String::new();

    loop {
        buf.clear();
        // `read_line` returns 0 only at end of input.
        if reader.read_line(&mut buf)? == 0 {
            break;
        }
        line_number += 1;

        // Trimming also removes the trailing "\n" / "\r\n" that `read_line` keeps.
        let line = buf.trim();
        if line.is_empty() {
            continue;
        }

        if let Some(header) = line.strip_prefix('>') {
            // Close the record that was open, unless it collected nothing.
            if let Some(id) = current_id.take() {
                if !current_seq.is_empty() {
                    prev_seq_len = current_seq.len();
                    current_seq.shrink_to_fit();
                    sequences.push(Sequence::from_bytes(id, std::mem::take(&mut current_seq)));
                }
            }

            let id = header.split_whitespace().next().unwrap_or(header);
            if id.is_empty() {
                return Err(FastaError::InvalidFormat(format!(
                    "Empty sequence identifier at line {}",
                    line_number
                )));
            }
            // Allocate the owned identifier only once it is known to be valid.
            current_id = Some(id.to_string());
            current_seq = Vec::with_capacity(prev_seq_len.max(1000));
        } else {
            if current_id.is_none() {
                return Err(FastaError::SequenceWithoutHeader(line_number));
            }
            // Only pay for per-byte filtering when the line really has inner whitespace.
            if line.bytes().all(|b| !b.is_ascii_whitespace()) {
                current_seq.extend_from_slice(line.as_bytes());
            } else {
                current_seq.extend(line.bytes().filter(|b| !b.is_ascii_whitespace()));
            }
        }
    }

    // Emit the record still open at end of input, if it has any data.
    if let Some(id) = current_id {
        if !current_seq.is_empty() {
            current_seq.shrink_to_fit();
            sequences.push(Sequence::from_bytes(id, current_seq));
        }
    }

    if sequences.is_empty() {
        return Err(FastaError::EmptyFile);
    }
    sequences.shrink_to_fit();
    Ok(Alignment::new(sequences))
}
/// Parses FASTA text held in a string slice.
///
/// This simply forwards `content` to [`parse_fasta_fast`] and returns its result
/// unchanged, including any error.
pub fn parse_fasta_str(content: &str) -> FastaResult<Alignment> {
parse_fasta_fast(content)
}
#[cfg(test)]
mod tests {
    //! Unit tests for the in-memory FASTA parser entry point.

    use super::*;

    #[test]
    fn test_parse_simple_fasta() {
        let content = ">seq1\nACGT\n>seq2\nTGCA\n";
        let alignment = parse_fasta_str(content).unwrap();
        assert_eq!(alignment.sequence_count(), 2);
        assert_eq!(alignment.get(0).unwrap().id, "seq1");
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGT");
        assert_eq!(alignment.get(1).unwrap().id, "seq2");
        assert_eq!(alignment.get(1).unwrap().as_str(), "TGCA");
    }

    #[test]
    fn test_parse_multiline_sequence() {
        let content = ">seq1\nACGT\nTGCA\nAAAA\n";
        let alignment = parse_fasta_str(content).unwrap();
        assert_eq!(alignment.sequence_count(), 1);
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTTGCAAAAA");
    }

    #[test]
    fn test_parse_with_description() {
        // Only the first token after '>' is the identifier; the rest is ignored.
        let content = ">seq1 This is a description\nACGT\n";
        let alignment = parse_fasta_str(content).unwrap();
        assert_eq!(alignment.get(0).unwrap().id, "seq1");
    }

    #[test]
    fn test_parse_with_empty_lines() {
        let content = ">seq1\nACGT\n\n>seq2\n\nTGCA\n";
        let alignment = parse_fasta_str(content).unwrap();
        assert_eq!(alignment.sequence_count(), 2);
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGT");
        assert_eq!(alignment.get(1).unwrap().as_str(), "TGCA");
    }

    #[test]
    fn test_empty_file() {
        let content = "";
        let result = parse_fasta_str(content);
        assert!(matches!(result, Err(FastaError::EmptyFile)));
    }

    #[test]
    fn test_sequence_without_header() {
        let content = "ACGT\n>seq1\nTGCA\n";
        let result = parse_fasta_str(content);
        assert!(matches!(result, Err(FastaError::SequenceWithoutHeader(_))));
    }

    #[test]
    fn test_empty_identifier_is_rejected() {
        // A bare '>' carries no identifier and must be reported, not skipped.
        let result = parse_fasta_str(">\nACGT\n");
        assert!(matches!(result, Err(FastaError::InvalidFormat(_))));
    }

    #[test]
    fn test_inline_whitespace_is_removed() {
        // Exercises the filtering branch taken when a line has inner whitespace.
        let alignment = parse_fasta_str(">seq1\nAC GT\tAA\n").unwrap();
        assert_eq!(alignment.sequence_count(), 1);
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTAA");
    }

    #[test]
    fn test_crlf_line_endings() {
        let alignment = parse_fasta_str(">seq1\r\nACGT\r\n>seq2\r\nTGCA\r\n").unwrap();
        assert_eq!(alignment.sequence_count(), 2);
        assert_eq!(alignment.get(0).unwrap().id, "seq1");
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGT");
        assert_eq!(alignment.get(1).unwrap().id, "seq2");
        assert_eq!(alignment.get(1).unwrap().as_str(), "TGCA");
    }

    #[test]
    fn test_alignment_validation() {
        let content = ">seq1\nACGT\n>seq2\nTGCA\n";
        let alignment = parse_fasta_str(content).unwrap();
        assert!(alignment.is_valid_alignment);

        // Sequences of different lengths still parse, but are flagged.
        let content = ">seq1\nACGT\n>seq2\nTG\n";
        let alignment = parse_fasta_str(content).unwrap();
        assert!(!alignment.is_valid_alignment);
        assert!(alignment.warning.is_some());
    }

    #[test]
    fn test_case_preservation() {
        // Lowercase input must come back unchanged (no case normalisation).
        let content = ">seq1\nacgt\n";
        let alignment = parse_fasta_str(content).unwrap();
        assert_eq!(alignment.get(0).unwrap().as_str(), "acgt");
    }
}