#![allow(dead_code)]
#![allow(missing_docs)]
use crate::error::{IoError, Result};
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;
/// A single FASTA record: an identifier, an optional free-text
/// description, and the sequence data.
#[derive(Debug, Clone, PartialEq)]
pub struct FastaRecord {
    id: String,
    description: Option<String>,
    sequence: String,
}

impl FastaRecord {
    /// Builds a record with no description.
    pub fn new(id: String, sequence: String) -> Self {
        Self {
            id,
            description: None,
            sequence,
        }
    }

    /// Builds a record carrying a description.
    pub fn with_description(id: String, description: String, sequence: String) -> Self {
        Self {
            id,
            description: Some(description),
            sequence,
        }
    }

    /// The record identifier (header text before the first space).
    pub fn id(&self) -> &str {
        &self.id
    }

    /// The optional description (header text after the first space).
    pub fn description(&self) -> Option<&str> {
        self.description.as_deref()
    }

    /// The raw sequence string.
    pub fn sequence(&self) -> &str {
        &self.sequence
    }

    /// Number of characters in the sequence.
    pub fn len(&self) -> usize {
        self.sequence.len()
    }

    /// True when the sequence has no characters.
    pub fn is_empty(&self) -> bool {
        self.sequence.is_empty()
    }

    /// Reassembles the header text (id plus description, if any),
    /// without the leading '>'.
    pub fn header(&self) -> String {
        self.description
            .as_ref()
            .map_or_else(|| self.id.clone(), |desc| format!("{} {}", self.id, desc))
    }
}
/// Streaming reader for FASTA files.
pub struct FastaReader {
reader: BufReader<File>,
// Scratch buffer reused across read_line calls to avoid a fresh
// allocation per line.
line_buffer: String,
// Holds the next record's header line when read_record overshoots
// while scanning for the end of the current record's sequence.
lookahead: Option<String>,
}
impl FastaReader {
    /// Opens `path` for FASTA reading.
    ///
    /// # Errors
    /// Returns `IoError::FileNotFound` when the file cannot be opened.
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::open(path.as_ref())
            .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
        Ok(Self {
            reader: BufReader::new(file),
            line_buffer: String::new(),
            lookahead: None,
        })
    }

    /// Returns an iterator over the records in the file.
    pub fn records(&mut self) -> FastaRecordIterator<'_> {
        FastaRecordIterator { reader: self }
    }

    /// Reads the next record, or `Ok(None)` at end of file.
    ///
    /// Lines before the first '>' header are skipped. Sequence lines are
    /// trimmed and concatenated until the next header or EOF; an overshot
    /// header line is stashed in `self.lookahead` for the next call.
    fn read_record(&mut self) -> Result<Option<FastaRecord>> {
        self.line_buffer.clear();
        if let Some(lookahead) = self.lookahead.take() {
            // A previous call already consumed this record's header line.
            self.line_buffer = lookahead;
        } else {
            // Scan forward to the next header line, skipping anything else.
            loop {
                if self
                    .reader
                    .read_line(&mut self.line_buffer)
                    .map_err(|e| IoError::ParseError(format!("Failed to read line: {e}")))?
                    == 0
                {
                    // EOF before any header: no more records.
                    return Ok(None);
                }
                if self.line_buffer.starts_with('>') {
                    break;
                }
                self.line_buffer.clear();
            }
        }
        // Split "id description" at the first space; the id has no spaces.
        let header = self.line_buffer[1..].trim().to_string();
        let (id, description) = if let Some(space_pos) = header.find(' ') {
            let (id_part, desc_part) = header.split_at(space_pos);
            (id_part.to_string(), Some(desc_part[1..].to_string()))
        } else {
            (header, None)
        };
        // Accumulate sequence lines until the next header or EOF.
        let mut sequence = String::new();
        self.line_buffer.clear();
        loop {
            let bytes_read = self
                .reader
                .read_line(&mut self.line_buffer)
                .map_err(|e| IoError::ParseError(format!("Failed to read line: {e}")))?;
            if bytes_read == 0 {
                // EOF ends the record.
                break;
            }
            if self.line_buffer.starts_with('>') {
                // Overshot into the next record's header: stash it for the
                // next read_record call, moving the buffer instead of
                // cloning it (the old code cloned, and also carried a dead
                // `if !starts_with('>')` branch that was always true here).
                self.lookahead = Some(std::mem::take(&mut self.line_buffer));
                break;
            }
            sequence.push_str(self.line_buffer.trim());
            self.line_buffer.clear();
        }
        Ok(Some(FastaRecord {
            id,
            description,
            sequence,
        }))
    }
}
/// Iterator adapter yielding `Result<FastaRecord>` from a `FastaReader`.
pub struct FastaRecordIterator<'a> {
    reader: &'a mut FastaReader,
}

impl Iterator for FastaRecordIterator<'_> {
    type Item = Result<FastaRecord>;

    /// Pulls the next record; `None` signals end of file, `Some(Err(_))`
    /// reports a read or parse failure.
    fn next(&mut self) -> Option<Self::Item> {
        // Result<Option<T>, E> -> Option<Result<T, E>> is exactly `transpose`.
        self.reader.read_record().transpose()
    }
}
/// Buffered writer that emits FASTA records with wrapped sequence lines.
pub struct FastaWriter {
    writer: BufWriter<File>,
    // Maximum number of sequence characters per output line.
    line_width: usize,
}

impl FastaWriter {
    /// Creates (or truncates) `path` and returns a writer using the
    /// conventional 80-column sequence wrap.
    ///
    /// # Errors
    /// Returns `IoError::FileError` when the file cannot be created.
    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::create(path.as_ref())
            .map_err(|e| IoError::FileError(format!("Failed to create file: {e}")))?;
        Ok(Self {
            writer: BufWriter::new(file),
            line_width: 80,
        })
    }

    /// Sets the sequence wrap width. A width of 0 is treated as 1 at
    /// write time, since zero-length chunks are impossible.
    pub fn set_line_width(&mut self, width: usize) {
        self.line_width = width;
    }

    /// Writes one record as a '>' header line followed by wrapped
    /// sequence lines.
    ///
    /// # Errors
    /// Returns `IoError::FileError` on any underlying write failure.
    pub fn write_record(&mut self, record: &FastaRecord) -> Result<()> {
        write!(self.writer, ">{}", record.header())
            .map_err(|e| IoError::FileError(format!("Failed to write header: {e}")))?;
        writeln!(self.writer)
            .map_err(|e| IoError::FileError(format!("Failed to write newline: {e}")))?;
        let sequence = record.sequence();
        // Guard against a zero width: `chunks(0)` panics, so a caller doing
        // set_line_width(0) would previously crash here.
        let width = self.line_width.max(1);
        for chunk in sequence.as_bytes().chunks(width) {
            self.writer
                .write_all(chunk)
                .map_err(|e| IoError::FileError(format!("Failed to write sequence: {e}")))?;
            writeln!(self.writer)
                .map_err(|e| IoError::FileError(format!("Failed to write newline: {e}")))?;
        }
        Ok(())
    }

    /// Flushes buffered output. `Drop` also flushes but silently swallows
    /// errors; call this to observe them.
    pub fn flush(&mut self) -> Result<()> {
        self.writer
            .flush()
            .map_err(|e| IoError::FileError(format!("Failed to flush: {e}")))
    }
}
/// FASTQ quality-score encoding, which determines the ASCII offset used
/// when converting between quality characters and numeric scores.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum QualityEncoding {
/// Phred+33 (offset 33); the modern standard and the default.
#[default]
Sanger,
/// Legacy Phred+64 (offset 64).
Illumina,
}
/// A single FASTQ record: identifier, optional description, sequence,
/// and a per-base quality string of the same length.
#[derive(Debug, Clone, PartialEq)]
pub struct FastqRecord {
    id: String,
    description: Option<String>,
    sequence: String,
    quality: String,
}

impl FastqRecord {
    /// Enforces the FASTQ invariant that sequence and quality strings
    /// have equal length (shared by both constructors, which previously
    /// duplicated this check).
    fn check_lengths(sequence: &str, quality: &str) -> Result<()> {
        if sequence.len() != quality.len() {
            return Err(IoError::ParseError(format!(
                "Sequence and quality lengths don't match: {} vs {}",
                sequence.len(),
                quality.len()
            )));
        }
        Ok(())
    }

    /// Builds a record with no description.
    ///
    /// # Errors
    /// Returns `IoError::ParseError` when sequence and quality lengths differ.
    pub fn new(id: String, sequence: String, quality: String) -> Result<Self> {
        Self::check_lengths(&sequence, &quality)?;
        Ok(Self {
            id,
            description: None,
            sequence,
            quality,
        })
    }

    /// Builds a record carrying a description.
    ///
    /// # Errors
    /// Returns `IoError::ParseError` when sequence and quality lengths differ.
    pub fn with_description(
        id: String,
        description: String,
        sequence: String,
        quality: String,
    ) -> Result<Self> {
        Self::check_lengths(&sequence, &quality)?;
        Ok(Self {
            id,
            description: Some(description),
            sequence,
            quality,
        })
    }

    /// The record identifier (header text before the first space).
    pub fn id(&self) -> &str {
        &self.id
    }

    /// The optional description (header text after the first space).
    pub fn description(&self) -> Option<&str> {
        self.description.as_deref()
    }

    /// The raw sequence string.
    pub fn sequence(&self) -> &str {
        &self.sequence
    }

    /// The raw quality string (one ASCII character per base).
    pub fn quality(&self) -> &str {
        &self.quality
    }

    /// Number of bases in the record.
    pub fn len(&self) -> usize {
        self.sequence.len()
    }

    /// True when the record holds no bases.
    pub fn is_empty(&self) -> bool {
        self.sequence.is_empty()
    }

    /// Decodes the quality string into numeric scores using the given
    /// encoding's ASCII offset. Characters below the offset saturate to
    /// 0 rather than underflowing.
    pub fn quality_scores(&self, encoding: QualityEncoding) -> Vec<u8> {
        let offset = match encoding {
            QualityEncoding::Sanger => 33,
            QualityEncoding::Illumina => 64,
        };
        self.quality
            .bytes()
            .map(|b| b.saturating_sub(offset))
            .collect()
    }

    /// Reassembles the header text (id plus description, if any),
    /// without the leading '@'.
    pub fn header(&self) -> String {
        match &self.description {
            Some(desc) => format!("{} {}", self.id, desc),
            None => self.id.clone(),
        }
    }
}
/// Streaming reader for FASTQ files (four lines per record).
pub struct FastqReader {
reader: BufReader<File>,
// Stored at open time but not consulted while parsing; retained for
// future quality-aware reading.
#[allow(dead_code)]
encoding: QualityEncoding,
// Scratch buffer reused across read_line calls to avoid a fresh
// allocation per line.
line_buffer: String,
}
impl FastqReader {
    /// Opens `path` for reading with the default (Sanger) quality encoding.
    ///
    /// # Errors
    /// Returns `IoError::FileNotFound` when the file cannot be opened.
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
        Self::open_with_encoding(path, QualityEncoding::default())
    }

    /// Opens `path` for reading with an explicit quality encoding.
    ///
    /// # Errors
    /// Returns `IoError::FileNotFound` when the file cannot be opened.
    pub fn open_with_encoding<P: AsRef<Path>>(path: P, encoding: QualityEncoding) -> Result<Self> {
        let file = File::open(path.as_ref())
            .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
        Ok(Self {
            reader: BufReader::new(file),
            encoding,
            line_buffer: String::new(),
        })
    }

    /// Returns an iterator over the records in the file.
    pub fn records(&mut self) -> FastqRecordIterator<'_> {
        FastqRecordIterator { reader: self }
    }

    /// Reads the next four-line FASTQ record, or `Ok(None)` at end of file.
    ///
    /// # Errors
    /// Returns `IoError::ParseError` on read failures, malformed headers,
    /// a missing '+' separator, or mismatched sequence/quality lengths.
    fn read_record(&mut self) -> Result<Option<FastqRecord>> {
        // Line 1: "@id [description]".
        self.line_buffer.clear();
        if self
            .reader
            .read_line(&mut self.line_buffer)
            .map_err(|e| IoError::ParseError(format!("Failed to read header: {e}")))?
            == 0
        {
            return Ok(None);
        }
        if !self.line_buffer.starts_with('@') {
            return Err(IoError::ParseError(format!(
                "Expected '@' at start of header, found: {}",
                self.line_buffer.trim()
            )));
        }
        let header = self.line_buffer[1..].trim().to_string();
        let (id, description) = if let Some(space_pos) = header.find(' ') {
            let (id_part, desc_part) = header.split_at(space_pos);
            (id_part.to_string(), Some(desc_part[1..].to_string()))
        } else {
            (header, None)
        };
        // Line 2: the sequence.
        self.line_buffer.clear();
        self.reader
            .read_line(&mut self.line_buffer)
            .map_err(|e| IoError::ParseError(format!("Failed to read sequence: {e}")))?;
        let sequence = self.line_buffer.trim().to_string();
        // Line 3: the '+' separator (any text after the '+' is ignored).
        self.line_buffer.clear();
        self.reader
            .read_line(&mut self.line_buffer)
            .map_err(|e| IoError::ParseError(format!("Failed to read separator: {e}")))?;
        if !self.line_buffer.starts_with('+') {
            return Err(IoError::ParseError(format!(
                "Expected '+' separator, found: {}",
                self.line_buffer.trim()
            )));
        }
        // Line 4: the quality string.
        self.line_buffer.clear();
        self.reader
            .read_line(&mut self.line_buffer)
            .map_err(|e| IoError::ParseError(format!("Failed to read quality: {e}")))?;
        let quality = self.line_buffer.trim().to_string();
        // Build the record, keeping the parsed description. The previous
        // code always called FastqRecord::new (dropping the description on
        // success) with an unreachable with_description fallback, since
        // both constructors apply the same length validation.
        let record = match description {
            Some(desc) => FastqRecord::with_description(id, desc, sequence, quality)?,
            None => FastqRecord::new(id, sequence, quality)?,
        };
        Ok(Some(record))
    }
}
/// Iterator adapter yielding `Result<FastqRecord>` from a `FastqReader`.
pub struct FastqRecordIterator<'a> {
    reader: &'a mut FastqReader,
}

impl Iterator for FastqRecordIterator<'_> {
    type Item = Result<FastqRecord>;

    /// Pulls the next record; `None` signals end of file, `Some(Err(_))`
    /// reports a read or parse failure.
    fn next(&mut self) -> Option<Self::Item> {
        // Result<Option<T>, E> -> Option<Result<T, E>> is exactly `transpose`.
        self.reader.read_record().transpose()
    }
}
/// Buffered writer for FASTQ records.
pub struct FastqWriter {
    writer: BufWriter<File>,
    // Encoding used when converting numeric scores to quality characters.
    encoding: QualityEncoding,
}

impl FastqWriter {
    /// Creates (or truncates) `path` with the default (Sanger) encoding.
    ///
    /// # Errors
    /// Returns `IoError::FileError` when the file cannot be created.
    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
        Self::create_with_encoding(path, QualityEncoding::default())
    }

    /// Creates (or truncates) `path` with an explicit quality encoding.
    ///
    /// # Errors
    /// Returns `IoError::FileError` when the file cannot be created.
    pub fn create_with_encoding<P: AsRef<Path>>(
        path: P,
        encoding: QualityEncoding,
    ) -> Result<Self> {
        let file = File::create(path.as_ref())
            .map_err(|e| IoError::FileError(format!("Failed to create file: {e}")))?;
        Ok(Self {
            writer: BufWriter::new(file),
            encoding,
        })
    }

    /// Writes one record as the standard four FASTQ lines
    /// (header, sequence, '+', quality).
    ///
    /// # Errors
    /// Returns `IoError::FileError` on any underlying write failure.
    pub fn write_record(&mut self, record: &FastqRecord) -> Result<()> {
        writeln!(self.writer, "@{}", record.header())
            .map_err(|e| IoError::FileError(format!("Failed to write header: {e}")))?;
        writeln!(self.writer, "{}", record.sequence())
            .map_err(|e| IoError::FileError(format!("Failed to write sequence: {e}")))?;
        writeln!(self.writer, "+")
            .map_err(|e| IoError::FileError(format!("Failed to write separator: {e}")))?;
        writeln!(self.writer, "{}", record.quality())
            .map_err(|e| IoError::FileError(format!("Failed to write quality: {e}")))?;
        Ok(())
    }

    /// Encodes numeric quality scores using this writer's encoding and
    /// writes the resulting record.
    ///
    /// # Errors
    /// Returns an error when lengths mismatch or a write fails.
    pub fn write_record_with_scores(
        &mut self,
        id: &str,
        sequence: &str,
        quality_scores: &[u8],
    ) -> Result<()> {
        if sequence.len() != quality_scores.len() {
            return Err(IoError::FileError(format!(
                "Sequence and quality lengths don't match: {} vs {}",
                sequence.len(),
                quality_scores.len()
            )));
        }
        let ascii_offset: u8 = match self.encoding {
            QualityEncoding::Sanger => 33,
            QualityEncoding::Illumina => 64,
        };
        let encoded: String = quality_scores
            .iter()
            .map(|&score| char::from(score.saturating_add(ascii_offset)))
            .collect();
        let record = FastqRecord::new(id.to_string(), sequence.to_string(), encoded)?;
        self.write_record(&record)
    }

    /// Flushes buffered output. `Drop` also flushes but silently swallows
    /// errors; call this to observe them.
    pub fn flush(&mut self) -> Result<()> {
        self.writer
            .flush()
            .map_err(|e| IoError::FileError(format!("Failed to flush: {e}")))
    }
}
/// Counts FASTA records in `path` by counting '>' header lines.
///
/// # Errors
/// Returns `IoError::FileNotFound` when the file cannot be opened, and
/// `IoError::ParseError` when a line cannot be read. (Read errors now
/// propagate instead of silently truncating the count mid-file.)
#[allow(dead_code)]
pub fn count_fastasequences<P: AsRef<Path>>(path: P) -> Result<usize> {
    let file = File::open(path.as_ref())
        .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
    let reader = BufReader::new(file);
    let mut count = 0;
    for line in reader.lines() {
        let line = line.map_err(|e| IoError::ParseError(format!("Failed to read line: {e}")))?;
        if line.starts_with('>') {
            count += 1;
        }
    }
    Ok(count)
}
/// Counts FASTQ records in `path`, assuming exactly four lines per record.
///
/// # Errors
/// Returns `IoError::FileNotFound` when the file cannot be opened,
/// `IoError::ParseError` when a line cannot be read (previously read
/// errors were counted as lines) or when the line count is not a
/// multiple of four.
#[allow(dead_code)]
pub fn count_fastqsequences<P: AsRef<Path>>(path: P) -> Result<usize> {
    let file = File::open(path.as_ref())
        .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
    let reader = BufReader::new(file);
    let mut line_count: usize = 0;
    for line in reader.lines() {
        line.map_err(|e| IoError::ParseError(format!("Failed to read line: {e}")))?;
        line_count += 1;
    }
    if line_count % 4 != 0 {
        return Err(IoError::ParseError(format!(
            "Invalid FASTQ file: line count {line_count} is not divisible by 4"
        )));
    }
    Ok(line_count / 4)
}
pub mod analysis {
use super::*;
use std::collections::HashMap;
/// Fraction of G/C characters (case-insensitive) over the full sequence
/// length. Returns 0.0 for an empty sequence.
pub fn gc_content(sequence: &str) -> f64 {
    if sequence.is_empty() {
        return 0.0;
    }
    let gc_count = sequence
        .chars()
        .filter(|c| matches!(c.to_ascii_uppercase(), 'G' | 'C'))
        .count();
    gc_count as f64 / sequence.len() as f64
}
/// Counts occurrences of each character in `sequence`, keyed by its
/// ASCII-uppercased form.
pub fn nucleotide_composition(sequence: &str) -> HashMap<char, usize> {
    sequence.chars().fold(HashMap::new(), |mut counts, base| {
        *counts.entry(base.to_ascii_uppercase()).or_default() += 1;
        counts
    })
}
/// Reverse complement of a DNA/RNA sequence. Recognized bases are
/// ASCII-uppercased before complementing ('U' complements to 'A');
/// unrecognized characters pass through unchanged in their original case.
pub fn reverse_complement(sequence: &str) -> String {
    let mut out = String::with_capacity(sequence.len());
    for c in sequence.chars().rev() {
        let complement = match c.to_ascii_uppercase() {
            'A' => 'T',
            'T' => 'A',
            'G' => 'C',
            'C' => 'G',
            'U' => 'A',
            'N' => 'N',
            _ => c,
        };
        out.push(complement);
    }
    out
}
/// Translates a DNA sequence into a protein string using the standard
/// genetic code. Codons are read as non-overlapping triplets from the
/// start of the sequence; a trailing partial codon is ignored, unknown
/// codons translate to 'X', and stop codons appear as '*'.
pub fn translate_dna(sequence: &str) -> String {
    let bases: Vec<char> = sequence.chars().collect();
    bases
        .chunks_exact(3)
        .map(|codon| {
            let codon_str: String = codon.iter().collect::<String>().to_uppercase();
            // Match-based lookup: the old code rebuilt a 64-entry HashMap
            // on every call (and once per ORF via find_orfs).
            translate_codon(&codon_str).unwrap_or('X')
        })
        .collect()
}

/// Looks up one uppercase codon in the standard genetic code without
/// allocating a table; returns `None` for anything that is not one of
/// the 64 standard codons.
fn translate_codon(codon: &str) -> Option<char> {
    let amino_acid = match codon {
        "TTT" | "TTC" => 'F',
        "TTA" | "TTG" | "CTT" | "CTC" | "CTA" | "CTG" => 'L',
        "TCT" | "TCC" | "TCA" | "TCG" | "AGT" | "AGC" => 'S',
        "TAT" | "TAC" => 'Y',
        "TAA" | "TAG" | "TGA" => '*',
        "TGT" | "TGC" => 'C',
        "TGG" => 'W',
        "CCT" | "CCC" | "CCA" | "CCG" => 'P',
        "CAT" | "CAC" => 'H',
        "CAA" | "CAG" => 'Q',
        "CGT" | "CGC" | "CGA" | "CGG" | "AGA" | "AGG" => 'R',
        "ATT" | "ATC" | "ATA" => 'I',
        "ATG" => 'M',
        "ACT" | "ACC" | "ACA" | "ACG" => 'T',
        "AAT" | "AAC" => 'N',
        "AAA" | "AAG" => 'K',
        "GTT" | "GTC" | "GTA" | "GTG" => 'V',
        "GCT" | "GCC" | "GCA" | "GCG" => 'A',
        "GAT" | "GAC" => 'D',
        "GAA" | "GAG" => 'E',
        "GGT" | "GGC" | "GGA" | "GGG" => 'G',
        _ => return None,
    };
    Some(amino_acid)
}

/// Builds the standard genetic code as a codon -> amino-acid map.
/// Retained for callers that want the whole table; `translate_dna`
/// itself uses the allocation-free `translate_codon` lookup.
fn get_standard_genetic_code() -> HashMap<String, char> {
    const BASES: [char; 4] = ['T', 'C', 'A', 'G'];
    let mut code = HashMap::new();
    for &first in &BASES {
        for &second in &BASES {
            for &third in &BASES {
                let codon: String = [first, second, third].iter().collect();
                if let Some(amino_acid) = translate_codon(&codon) {
                    code.insert(codon, amino_acid);
                }
            }
        }
    }
    code
}
/// Finds open reading frames (ATG .. stop codon) on the forward strand
/// in all three reading frames. Only ORFs of at least `minlength`
/// nucleotides (start through stop codon inclusive) are reported; the
/// reverse strand is not scanned.
///
/// NOTE(review): positions are computed over chars but used as byte
/// indices into `sequence` — valid for ASCII nucleotide input, which is
/// what this module assumes throughout.
pub fn find_orfs(sequence: &str, minlength: usize) -> Vec<Orf> {
    let mut orfs = Vec::new();
    let seq_upper = sequence.to_uppercase();
    for frame in 0..3 {
        // Guard: `&seq_upper[frame..]` panicked on sequences shorter than
        // the frame offset (e.g. an empty or single-base sequence).
        if frame >= seq_upper.len() {
            break;
        }
        let frame_seq = &seq_upper[frame..];
        let mut start_pos = None;
        for (pos, codon) in frame_seq.chars().collect::<Vec<_>>().chunks(3).enumerate() {
            if codon.len() < 3 {
                break;
            }
            let codon_str: String = codon.iter().collect();
            // Remember the first ATG seen since the last stop codon.
            if codon_str == "ATG" && start_pos.is_none() {
                start_pos = Some(frame + pos * 3);
            }
            if matches!(codon_str.as_str(), "TAA" | "TAG" | "TGA") {
                if let Some(start) = start_pos {
                    // Length spans start codon through stop codon inclusive.
                    let length = frame + pos * 3 + 3 - start;
                    if length >= minlength {
                        let orf_seq = &sequence[start..start + length];
                        orfs.push(Orf {
                            start_pos: start,
                            end_pos: start + length,
                            frame: frame as i8,
                            sequence: orf_seq.to_string(),
                            protein: translate_dna(orf_seq),
                        });
                    }
                    start_pos = None;
                }
            }
        }
    }
    orfs
}
/// An open reading frame located by `find_orfs`.
#[derive(Debug, Clone, PartialEq)]
pub struct Orf {
/// 0-based offset of the start codon within the scanned sequence.
pub start_pos: usize,
/// One past the last base of the stop codon.
pub end_pos: usize,
/// Reading frame (0, 1, or 2) the ORF was found in.
pub frame: i8,
/// Nucleotide sequence of the ORF, start codon through stop codon.
pub sequence: String,
/// Translation of `sequence`, including the trailing '*' stop symbol.
pub protein: String,
}
impl Orf {
/// Length of the ORF in nucleotides.
pub fn length(&self) -> usize {
self.sequence.len()
}
}
/// Aggregate statistics over a set of FASTA records. Returns
/// `SequenceStats::default()` for an empty slice.
pub fn sequence_stats(records: &[FastaRecord]) -> SequenceStats {
    if records.is_empty() {
        return SequenceStats::default();
    }
    let lengths: Vec<usize> = records.iter().map(|r| r.len()).collect();
    let totallength: usize = lengths.iter().sum();
    // Safe: `records` (and therefore `lengths`) is non-empty here.
    let minlength = *lengths.iter().min().expect("lengths is non-empty");
    let maxlength = *lengths.iter().max().expect("lengths is non-empty");
    let meanlength = totallength as f64 / records.len() as f64;
    // N50: the length at which the cumulative sum of descending lengths
    // first reaches half the total.
    let mut sortedlengths = lengths;
    sortedlengths.sort_unstable_by(|a, b| b.cmp(a));
    let half_total = totallength / 2;
    let mut cumulative = 0;
    let mut n50 = 0;
    for length in sortedlengths {
        cumulative += length;
        if cumulative >= half_total {
            n50 = length;
            break;
        }
    }
    let total_gc: usize = records
        .iter()
        .map(|r| {
            r.sequence()
                .chars()
                .filter(|c| matches!(c.to_ascii_uppercase(), 'G' | 'C'))
                .count()
        })
        .sum();
    // Avoid 0/0 -> NaN when every record has an empty sequence.
    let gc_content = if totallength == 0 {
        0.0
    } else {
        total_gc as f64 / totallength as f64
    };
    SequenceStats {
        numsequences: records.len(),
        totallength,
        minlength,
        maxlength,
        meanlength,
        n50,
        gc_content,
    }
}
/// Summary statistics produced by `sequence_stats`.
#[derive(Debug, Clone, PartialEq)]
pub struct SequenceStats {
/// Number of records summarized.
pub numsequences: usize,
/// Sum of all sequence lengths.
pub totallength: usize,
/// Shortest sequence length.
pub minlength: usize,
/// Longest sequence length.
pub maxlength: usize,
/// Arithmetic mean of the sequence lengths.
pub meanlength: f64,
/// N50: length at which the cumulative sum of descending lengths
/// first reaches half the total length.
pub n50: usize,
/// Overall G+C fraction across all sequences.
pub gc_content: f64,
}
// All-zero statistics, used for an empty record set. (Identical to what
// a derived Default would produce; kept manual for explicitness.)
impl Default for SequenceStats {
fn default() -> Self {
Self {
numsequences: 0,
totallength: 0,
minlength: 0,
maxlength: 0,
meanlength: 0.0,
n50: 0,
gc_content: 0.0,
}
}
}
/// Aggregate quality statistics over FASTQ records. Returns
/// `QualityStats::default()` for an empty slice (or when every record
/// has an empty quality string).
pub fn quality_stats(records: &[FastqRecord], encoding: QualityEncoding) -> QualityStats {
    if records.is_empty() {
        return QualityStats::default();
    }
    let mut all_scores = Vec::new();
    let mut position_scores: HashMap<usize, Vec<u8>> = HashMap::new();
    for record in records {
        let scores = record.quality_scores(encoding);
        all_scores.extend_from_slice(&scores);
        for (pos, &score) in scores.iter().enumerate() {
            position_scores.entry(pos).or_default().push(score);
        }
    }
    // Guard against 0/0 -> NaN when no record carried any scores.
    if all_scores.is_empty() {
        return QualityStats::default();
    }
    // Sum in u64: summing raw u8 values overflowed (panicking in debug
    // builds) after only a handful of typical Phred scores.
    let total: u64 = all_scores.iter().map(|&s| u64::from(s)).sum();
    let mean_quality = total as f64 / all_scores.len() as f64;
    let min_quality = *all_scores.iter().min().unwrap_or(&0);
    let max_quality = *all_scores.iter().max().unwrap_or(&0);
    let max_pos = *position_scores.keys().max().unwrap_or(&0);
    let mut per_position_mean = Vec::with_capacity(max_pos + 1);
    for pos in 0..=max_pos {
        if let Some(scores) = position_scores.get(&pos) {
            let sum: u64 = scores.iter().map(|&s| u64::from(s)).sum();
            per_position_mean.push(sum as f64 / scores.len() as f64);
        } else {
            per_position_mean.push(0.0);
        }
    }
    QualityStats {
        mean_quality,
        min_quality,
        max_quality,
        per_position_mean,
    }
}
/// Summary quality statistics produced by `quality_stats`.
#[derive(Debug, Clone, PartialEq)]
pub struct QualityStats {
/// Mean of all per-base quality scores across all records.
pub mean_quality: f64,
/// Lowest observed score.
pub min_quality: u8,
/// Highest observed score.
pub max_quality: u8,
/// Mean score at each base position (index 0 = first base).
pub per_position_mean: Vec<f64>,
}
// Empty statistics, used for an empty record set. (Identical to what a
// derived Default would produce; kept manual for explicitness.)
impl Default for QualityStats {
fn default() -> Self {
Self {
mean_quality: 0.0,
min_quality: 0,
max_quality: 0,
per_position_mean: Vec::new(),
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::NamedTempFile;
// Round-trip: write two FASTA records (one with a description),
// then read them back and verify every field.
#[test]
fn test_fasta_read_write() -> Result<()> {
let temp_file = NamedTempFile::new().expect("Operation failed");
let path = temp_file.path();
{
let mut writer = FastaWriter::create(path)?;
writer.write_record(&FastaRecord::new(
"seq1".to_string(),
"ATCGATCGATCG".to_string(),
))?;
writer.write_record(&FastaRecord::with_description(
"seq2".to_string(),
"test sequence".to_string(),
"GCTAGCTAGCTA".to_string(),
))?;
// Flush explicitly so the reader below sees the full contents.
writer.flush()?;
}
{
let mut reader = FastaReader::open(path)?;
let records: Vec<_> = reader.records().collect::<Result<Vec<_>>>()?;
assert_eq!(records.len(), 2);
assert_eq!(records[0].id(), "seq1");
assert_eq!(records[0].sequence(), "ATCGATCGATCG");
assert_eq!(records[1].id(), "seq2");
assert_eq!(records[1].description(), Some("test sequence"));
assert_eq!(records[1].sequence(), "GCTAGCTAGCTA");
}
Ok(())
}
// Round-trip: one record written directly, one via numeric scores;
// verify the scores decode back under Sanger encoding.
#[test]
fn test_fastq_read_write() -> Result<()> {
let temp_file = NamedTempFile::new().expect("Operation failed");
let path = temp_file.path();
{
let mut writer = FastqWriter::create(path)?;
writer.write_record(&FastqRecord::new(
"read1".to_string(),
"ATCG".to_string(),
"IIII".to_string(),
)?)?;
writer.write_record_with_scores("read2", "GCTA", &[30, 35, 40, 35])?;
writer.flush()?;
}
{
let mut reader = FastqReader::open(path)?;
let records: Vec<_> = reader.records().collect::<Result<Vec<_>>>()?;
assert_eq!(records.len(), 2);
assert_eq!(records[0].id(), "read1");
assert_eq!(records[0].sequence(), "ATCG");
assert_eq!(records[0].quality(), "IIII");
assert_eq!(records[1].id(), "read2");
assert_eq!(records[1].sequence(), "GCTA");
let scores = records[1].quality_scores(QualityEncoding::Sanger);
assert_eq!(scores, vec![30, 35, 40, 35]);
}
Ok(())
}
// The cheap counting helpers must agree with the number of records
// actually written.
#[test]
fn testsequence_counting() -> Result<()> {
let fasta_file = NamedTempFile::new().expect("Operation failed");
let fastq_file = NamedTempFile::new().expect("Operation failed");
{
let mut writer = FastaWriter::create(fasta_file.path())?;
for i in 0..5 {
writer.write_record(&FastaRecord::new(format!("seq{}", i), "ATCG".to_string()))?;
}
writer.flush()?;
}
{
let mut writer = FastqWriter::create(fastq_file.path())?;
for i in 0..3 {
writer.write_record(&FastqRecord::new(
format!("read{}", i),
"ATCG".to_string(),
"IIII".to_string(),
)?)?;
}
writer.flush()?;
}
assert_eq!(count_fastasequences(fasta_file.path())?, 5);
assert_eq!(count_fastqsequences(fastq_file.path())?, 3);
Ok(())
}
}