use std::io;
use std::path::Path;
use bio::io::fasta::Reader;
use bio::io::fasta::Record;
use crate::error::{ProcessingError, ProcessingResult};
/// Reads records from a single FASTA file on disk.
///
/// The path supplied to [`FastaProcessor::new`] is kept in two forms: the
/// exact `PathBuf` that every filesystem call uses, and a lossy UTF-8
/// rendering that is used for [`FastaProcessor::file_path`] and for error
/// messages.
pub struct FastaProcessor {
    /// Exact path as supplied by the caller; used for all filesystem access so
    /// that paths which are not valid UTF-8 are still opened correctly.
    path: std::path::PathBuf,
    /// Lossy UTF-8 rendering of `path`; display-only.
    file_path: String,
}

impl FastaProcessor {
    /// Creates a processor for `file_path`.
    ///
    /// No filesystem access happens here; the file is opened when one of the
    /// reading methods is called.
    pub fn new<P: AsRef<Path>>(file_path: P) -> Self {
        let path = file_path.as_ref().to_path_buf();
        // The lossy string is only for display: invalid UTF-8 sequences in it
        // are replaced, so it must not be used to reach the file.
        let file_path = path.to_string_lossy().into_owned();
        Self { path, file_path }
    }

    /// Opens the file and calls `processor` once for each record, in the
    /// order the reader yields them.
    ///
    /// # Errors
    ///
    /// Returns an error when the file cannot be opened, when a record cannot
    /// be read, or when `processor` returns an error. In the last case the
    /// error is also written to stderr together with the record id, and no
    /// further records are visited.
    pub fn process_file<F>(&self, mut processor: F) -> ProcessingResult<()>
    where
        F: FnMut(&Record) -> ProcessingResult<()>,
    {
        let file = io::BufReader::new(std::fs::File::open(&self.path).map_err(|e| {
            ProcessingError::with_context(
                format!("Failed to open FASTA file: {}", self.file_path),
                e,
            )
        })?);
        let reader = Reader::new(file);

        for record_result in reader.records() {
            let record = record_result.map_err(|e| {
                ProcessingError::with_context(
                    format!("Error reading FASTA record from file: {}", &self.file_path),
                    e,
                )
            })?;

            if let Err(e) = processor(&record) {
                eprintln!("Error processing record {}: {}", record.id(), e);
                return Err(e);
            }
        }

        Ok(())
    }

    /// Reads the whole file and returns a clone of every record.
    ///
    /// # Errors
    ///
    /// Fails under the same conditions as [`FastaProcessor::process_file`].
    pub fn read_all(&self) -> ProcessingResult<Vec<Record>> {
        let mut sequences = Vec::new();
        self.process_file(|record| {
            // The callback only receives a borrow, so an owned copy is needed
            // to keep the record after the callback returns.
            sequences.push(record.clone());
            Ok(())
        })?;
        Ok(sequences)
    }

    /// Returns the path as a string.
    ///
    /// This is the lossy UTF-8 rendering of the path given to `new`; it equals
    /// the original path whenever that path is valid UTF-8.
    pub fn file_path(&self) -> &str {
        &self.file_path
    }

    /// Returns the result of `Path::exists` for the stored path.
    pub fn file_exists(&self) -> bool {
        self.path.exists()
    }

    /// Returns the length reported by the file's metadata.
    ///
    /// # Errors
    ///
    /// Returns an error when the metadata cannot be read.
    pub fn file_size(&self) -> ProcessingResult<u64> {
        let metadata = std::fs::metadata(&self.path).map_err(|e| {
            ProcessingError::with_context(
                format!("Failed to get file metadata: {}", self.file_path),
                e,
            )
        })?;
        Ok(metadata.len())
    }
}
/// Performs a quick sanity check of a FASTA file by reading its first records.
///
/// At most ten records are read. A record with an empty id or an empty
/// sequence produces a warning on stderr but does not fail validation.
///
/// # Errors
///
/// Returns an error when the path does not exist, the file cannot be opened,
/// one of the inspected records cannot be read, or no record is found at all.
pub fn validate_fasta_file<P: AsRef<Path>>(file_path: P) -> ProcessingResult<()> {
    // Upper bound on how many records are inspected before validation stops.
    const SAMPLE_LIMIT: usize = 10;

    let path = file_path.as_ref();
    if !path.exists() {
        let message = format!("FASTA file does not exist: {:?}", path);
        return Err(ProcessingError::new(message));
    }

    let handle = std::fs::File::open(path).map_err(|e| {
        ProcessingError::with_context(format!("Failed to open FASTA file: {:?}", path), e)
    })?;
    let reader = Reader::new(io::BufReader::new(handle));

    // Number of records read successfully so far (1-based position of the
    // record currently being inspected).
    let mut seen = 0usize;
    for (index, outcome) in reader.records().take(SAMPLE_LIMIT).enumerate() {
        let record = outcome.map_err(|e| {
            ProcessingError::with_context(
                format!("Error reading FASTA record during validation: {:?}", path),
                e,
            )
        })?;
        seen = index + 1;

        if record.id().is_empty() {
            eprintln!("Warning: Record {} has empty ID", seen);
        }
        if record.seq().is_empty() {
            eprintln!("Warning: Record {} has empty sequence", seen);
        }
    }

    if seen == 0 {
        return Err(ProcessingError::new("No valid FASTA records found in file"));
    }
    Ok(())
}
pub fn count_sequences<P: AsRef<Path>>(file_path: P) -> ProcessingResult<usize> {
let path = file_path.as_ref();
let file = io::BufReader::new(std::fs::File::open(path).map_err(|e| {
ProcessingError::with_context(format!("Failed to open FASTA file: {:?}", path), e)
})?);
let reader = Reader::new(file);
let mut count = 0;
for _ in reader.records() {
count += 1;
}
Ok(count)
}
/// Sums the sequence lengths of all records in the FASTA file at `file_path`.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or when a record cannot be
/// read; in the latter case no partial total is returned.
pub fn total_sequence_length<P: AsRef<Path>>(file_path: P) -> ProcessingResult<usize> {
    let path = file_path.as_ref();

    let handle = std::fs::File::open(path).map_err(|e| {
        ProcessingError::with_context(format!("Failed to open FASTA file: {:?}", path), e)
    })?;
    let reader = Reader::new(io::BufReader::new(handle));

    // Accumulate lengths record by record, stopping at the first read error.
    reader
        .records()
        .try_fold(0usize, |running, outcome| -> ProcessingResult<usize> {
            let record = outcome.map_err(|e| {
                ProcessingError::with_context(format!("Error reading FASTA record: {:?}", path), e)
            })?;
            Ok(running + record.seq().len())
        })
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file containing exactly `contents`.
    fn fasta_fixture(contents: &[u8]) -> NamedTempFile {
        let mut fixture = NamedTempFile::new().unwrap();
        fixture.write_all(contents).unwrap();
        fixture
    }

    /// The processor reports the path it was constructed with.
    #[test]
    fn test_fasta_processor_creation() {
        let fixture = NamedTempFile::new().unwrap();
        let processor = FastaProcessor::new(fixture.path());
        let expected = fixture.path().to_string_lossy();
        assert_eq!(processor.file_path(), expected);
    }

    /// `read_all` returns every record with its id and sequence, in order.
    #[test]
    fn test_read_all_sequences() {
        let fixture = fasta_fixture(b">seq1\nATGCATGC\n>seq2\nGCTAGCTA\n");
        let sequences = FastaProcessor::new(fixture.path()).read_all().unwrap();

        assert_eq!(sequences.len(), 2);
        let observed: Vec<(&str, &[u8])> = sequences
            .iter()
            .map(|record| (record.id(), record.seq()))
            .collect();
        let expected: Vec<(&str, &[u8])> = vec![
            ("seq1", &b"ATGCATGC"[..]),
            ("seq2", &b"GCTAGCTA"[..]),
        ];
        assert_eq!(observed, expected);
    }

    /// A file with well-formed records passes validation.
    #[test]
    fn test_validate_fasta_file() {
        let fixture = fasta_fixture(b">valid_seq\nATGC\n>another_seq\nGCTA\n");
        assert!(validate_fasta_file(fixture.path()).is_ok());
    }

    /// A file with no content is rejected by validation.
    #[test]
    fn test_validate_empty_file() {
        let fixture = NamedTempFile::new().unwrap();
        assert!(validate_fasta_file(fixture.path()).is_err());
    }

    /// `count_sequences` reports the number of records in the file.
    #[test]
    fn test_count_sequences() {
        let fixture = fasta_fixture(b">seq1\nATGC\n>seq2\nGCTA\n>seq3\nATGCGAT\n");
        assert_eq!(count_sequences(fixture.path()).unwrap(), 3);
    }

    /// `total_sequence_length` adds up the lengths of all sequences.
    #[test]
    fn test_total_sequence_length() {
        let fixture = fasta_fixture(b">seq1\nATGCATGC\n>seq2\nGCTAGCTA\n");
        assert_eq!(total_sequence_length(fixture.path()).unwrap(), 16);
    }
}