#![warn(clippy::pedantic)]
use std::fs::File;
use std::io::{self, BufReader, Read, Write};
use std::path::PathBuf;
use bio_seq::prelude::*;
use clap::{Parser, Subcommand};
use noodles::fasta::Reader;
use serde::{Deserialize, Serialize};
/// A sequence together with a tag recording which alphabet it uses.
///
/// This is the value `save_bin` hands to bincode and `load_bin` gets back,
/// so the variant stored in the file tells the reader how to decode the payload.
///
/// The serde derives are only applied when the `serde` feature is enabled.
/// NOTE(review): the bincode calls in this file presumably require those
/// derives, so building without the feature likely fails — confirm.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
enum TaggedSeq {
/// A sequence over the `Dna` codec.
Dna(Seq<Dna>),
/// A sequence over the `Amino` codec.
Amino(Seq<Amino>),
}
// Top-level command-line arguments; clap's `Parser` derive generates the parser
// that `main` invokes through `Cli::parse()`.
//
// Plain `//` comments are used on purpose: clap's derive picks up `///` doc
// comments as help text, which would change the program's output.
#[derive(Parser)]
#[command(
name = "seq2bin",
about = "Read the first record of a fasta file and write its bit-packed encoding to a binary image"
)]
struct Cli {
// The subcommand selected on the command line (one of `Commands`).
#[command(subcommand)]
command: Commands,
}
// The subcommands of the tool; `main` matches on these to decide what to do.
//
// Plain `//` comments are used on purpose: clap's derive picks up `///` doc
// comments as help text, which would change the program's output.
#[derive(Subcommand)]
enum Commands {
// Load the first FASTA record of `input` as a `Seq<Dna>` and save it to `out`.
Dna {
// Path of the FASTA file to read.
input: PathBuf,
// Path the binary image is written to; `short`/`long` make it a named option.
#[arg(short, long)]
out: PathBuf,
},
// Load the first FASTA record of `input` as a `Seq<Amino>` and save it to `out`.
Amino {
// Path of the FASTA file to read.
input: PathBuf,
// Path the binary image is written to; `short`/`long` make it a named option.
#[arg(short, long)]
out: PathBuf,
},
// Read a binary image previously written by this tool and print the sequence.
Open {
// Path of the binary image to read.
input: PathBuf,
},
}
fn save_bin(seq: &TaggedSeq, path: &PathBuf) -> io::Result<()> {
let bs: Vec<u8> = bincode::serialize(seq).unwrap();
let mut fd = File::create(path)?;
fd.write_all(&bs)?;
Ok(())
}
/// Reads the binary image at `path` and returns the stored sequence as text.
///
/// The whole file is read into memory, decoded with bincode into a
/// [`TaggedSeq`], and the contained sequence is rendered with `to_string`.
///
/// # Errors
///
/// Propagates I/O errors from opening or reading the file, and returns an
/// [`io::ErrorKind::InvalidData`] error when the bytes cannot be deserialized.
fn load_bin(path: &PathBuf) -> io::Result<String> {
    let mut bytes = Vec::new();
    File::open(path)?.read_to_end(&mut bytes)?;

    let decoded: TaggedSeq = bincode::deserialize(&bytes)
        .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))?;

    // Both alphabets render through `to_string`; the tag only selects the codec.
    let text = match decoded {
        TaggedSeq::Dna(dna) => dna.to_string(),
        TaggedSeq::Amino(amino) => amino.to_string(),
    };
    Ok(text)
}
/// Reads the first record of the FASTA file at `path` and converts its
/// sequence into a `Seq<A>`.
///
/// Only the first record is consumed; any further records are never read.
///
/// # Errors
///
/// - Propagates the I/O error if the file cannot be opened.
/// - Returns an [`io::ErrorKind::InvalidData`] error with the message
///   `"Empty fasta file"` if the reader yields no record at all.
/// - Propagates the error produced by the reader if the first record cannot
///   be read.
/// - Returns the converted error if the record's sequence cannot be turned
///   into a `Seq<A>`.
fn load_fasta<A: Codec>(path: &PathBuf) -> io::Result<Seq<A>> {
    let mut reader = Reader::new(BufReader::new(File::open(path)?));

    // `next()` yields `Option<io::Result<Record>>`: handle "no record" first.
    // The error built here is already an `io::Error`, so it is returned as-is
    // rather than being wrapped in a second `io::Error`.
    let first = reader
        .records()
        .next()
        .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Empty fasta file"))?;
    // ...then surface any error hit while reading that record.
    let record = first?;

    let seq: Seq<A> = record.sequence().as_ref().try_into()?;
    Ok(seq)
}
fn main() -> io::Result<()> {
let args = Cli::parse();
match args.command {
Commands::Dna { input, out } => {
let seq: Seq<Dna> = load_fasta(&input)?;
save_bin(&TaggedSeq::Dna(seq), &out)?;
}
Commands::Amino { input, out } => {
let seq: Seq<Amino> = load_fasta(&input)?;
save_bin(&TaggedSeq::Amino(seq), &out)?;
}
Commands::Open { input } => {
let seq: String = load_bin(&input)?;
println!("{seq}");
}
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary FASTA file holding a single record named
    /// `test_sequence` whose sequence line is `sequence`.
    fn create_temp_fasta(sequence: &str) -> io::Result<NamedTempFile> {
        let mut file = NamedTempFile::new()?;
        writeln!(file, ">test_sequence")?;
        writeln!(file, "{sequence}")?;
        Ok(file)
    }

    /// Runs the whole pipeline on `raw_seq`: writes it to a FASTA file, loads
    /// it as a `Seq<A>`, wraps it with `tag`, saves the binary image and reads
    /// the image back as text.
    fn roundtrip<A: Codec>(raw_seq: &str, tag: fn(Seq<A>) -> TaggedSeq) -> io::Result<String> {
        let fasta_file = create_temp_fasta(raw_seq)?;
        let bin_file = NamedTempFile::new()?;
        let seq: Seq<A> = load_fasta(&fasta_file.path().to_path_buf())?;
        save_bin(&tag(seq), &bin_file.path().to_path_buf())?;
        load_bin(&bin_file.path().to_path_buf())
    }

    /// A DNA sequence survives the FASTA -> binary -> text round trip unchanged.
    #[test]
    fn test_dna_roundtrip() -> io::Result<()> {
        let raw_seq =
            "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCGATCGAAAAAAAAAAAAA";
        let loaded_seq = roundtrip::<Dna>(raw_seq, TaggedSeq::Dna)?;
        assert_eq!(loaded_seq, raw_seq);
        Ok(())
    }

    /// An amino-acid sequence (including `*` symbols) survives the round trip unchanged.
    #[test]
    fn test_amino_roundtrip() -> io::Result<()> {
        let raw_seq = "NIFLCVWGGVFSRVSLCARGALSPRAPPLL*SVYTLYM*ERGDTRDISQSAHTPHI*KRENTQK";
        let loaded_seq = roundtrip::<Amino>(raw_seq, TaggedSeq::Amino)?;
        assert_eq!(loaded_seq, raw_seq);
        Ok(())
    }

    /// Loading a FASTA record as DNA fails when the sequence cannot be converted
    /// (presumably because `N` is not a valid `Dna` symbol).
    #[test]
    fn test_invalid_fasta() {
        let invalid_seq = "ACTGCTAN";
        let fasta_file = create_temp_fasta(invalid_seq).unwrap();
        let result: Result<Seq<Dna>, _> = load_fasta(&fasta_file.path().to_path_buf());
        assert!(result.is_err());
    }

    /// A file with no records at all is reported as an error.
    #[test]
    fn test_empty_fasta() {
        let file = NamedTempFile::new().unwrap();
        let result: Result<Seq<Dna>, _> = load_fasta(&file.path().to_path_buf());
        assert!(result.is_err());
    }

    /// Bytes that are not a bincode-encoded `TaggedSeq` are rejected by `load_bin`.
    #[test]
    fn test_corrupted_binary() {
        let file = NamedTempFile::new().unwrap();
        write!(file.as_file(), "not a valid binary sequence").unwrap();
        let result = load_bin(&file.path().to_path_buf());
        assert!(result.is_err());
    }

    /// A 4000-base DNA sequence survives the round trip unchanged.
    #[test]
    fn test_large_sequence() -> io::Result<()> {
        let large_dna = "ATCG".repeat(1000);
        let loaded_seq = roundtrip::<Dna>(&large_dna, TaggedSeq::Dna)?;
        assert_eq!(loaded_seq, large_dna);
        Ok(())
    }
}