bio-seq 0.14.8

Bit packed and well-typed biological sequences
Documentation
//! This example demonstrates how to save bio-seq sequences to binary files
//!
//! It encodes amino acid or DNA sequences from the first record of a fasta file
//! and saves them to a binary file along with metadata about the encoding
//!
//! Run with:
//! ```sh
//! cargo run --example seq2bin -- amino path/to/proteins.faa --out proteins.bin
//! cargo run --example seq2bin -- open proteins.bin
//! ```

#![warn(clippy::pedantic)]
use std::fs::File;
use std::io::{self, BufReader, Read, Write};
use std::path::PathBuf;

use bio_seq::prelude::*;
use clap::{Parser, Subcommand};
use noodles::fasta::Reader;
use serde::{Deserialize, Serialize};

/// We need to associate the sequence's codec with the binary data. The
/// codec parameter in `Seq<A: Codec>` is not actually recorded by serde.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
enum TaggedSeq {
    Dna(Seq<Dna>),
    Amino(Seq<Amino>),
}

#[derive(Parser)]
#[command(
    name = "seq2bin",
    about = "Read the first record of a fasta file and write its bit-packed encoding to a binary image"
)]
struct Cli {
    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    Dna {
        input: PathBuf,

        #[arg(short, long)]
        out: PathBuf,
    },
    Amino {
        input: PathBuf,

        #[arg(short, long)]
        out: PathBuf,
    },
    Open {
        input: PathBuf,
    },
}

/// Save a DNA or Amino acid sequence to a binary file
fn save_bin(seq: &TaggedSeq, path: &PathBuf) -> io::Result<()> {
    let bs: Vec<u8> = bincode::serialize(seq).unwrap();

    let mut fd = File::create(path)?;
    fd.write_all(&bs)?;

    Ok(())
}

/// Load a sequence from a binary file
fn load_bin(path: &PathBuf) -> io::Result<String> {
    let mut buf = Vec::new();
    let mut fd = File::open(path)?;

    fd.read_to_end(&mut buf)?;

    // Deserialise that raw buffer as a tagged Seq struct
    let data: TaggedSeq =
        bincode::deserialize(&buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

    Ok(match data {
        // We can handle sequences based on their encoding
        TaggedSeq::Dna(seq) => seq.to_string(),
        TaggedSeq::Amino(seq) => seq.to_string(),
    })
}

/// Read the first record of a fasta file and encode with associated codec
fn load_fasta<A: Codec>(path: &PathBuf) -> io::Result<Seq<A>> {
    let mut rdr = Reader::new(BufReader::new(File::open(path)?));

    // read first fasta record
    let record = rdr
        .records()
        .next()
        .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Empty fasta file"))
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

    // extract sequence from record and try to convert it do a `Seq`
    let seq: Seq<A> = record?.sequence().as_ref().try_into()?;

    Ok(seq)
}

fn main() -> io::Result<()> {
    let args = Cli::parse();

    match args.command {
        Commands::Dna { input, out } => {
            let seq: Seq<Dna> = load_fasta(&input)?;
            save_bin(&TaggedSeq::Dna(seq), &out)?;
        }
        Commands::Amino { input, out } => {
            let seq: Seq<Amino> = load_fasta(&input)?;
            save_bin(&TaggedSeq::Amino(seq), &out)?;
        }
        Commands::Open { input } => {
            let seq: String = load_bin(&input)?;
            println!("{seq}");
        }
    }

    Ok(())
}

// These tests were generated by LLM
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    fn create_temp_fasta(sequence: &str) -> io::Result<NamedTempFile> {
        let mut file = NamedTempFile::new()?;
        writeln!(file, ">test_sequence")?;
        writeln!(file, "{}", sequence)?;
        Ok(file)
    }

    #[test]
    fn test_dna_roundtrip() -> io::Result<()> {
        // Create a temporary FASTA file with DNA sequence
        let raw_seq =
            "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCGATCGAAAAAAAAAAAAA";
        let fasta_file = create_temp_fasta(raw_seq)?;
        let bin_file = NamedTempFile::new()?;

        // Load from FASTA and save to binary
        let seq: Seq<Dna> = load_fasta(&fasta_file.path().to_path_buf())?;
        save_bin(&TaggedSeq::Dna(seq), &bin_file.path().to_path_buf())?;

        // Load back from binary and verify
        let loaded_seq = load_bin(&bin_file.path().to_path_buf())?;
        assert_eq!(loaded_seq, raw_seq);

        Ok(())
    }

    #[test]
    fn test_amino_roundtrip() -> io::Result<()> {
        // Create a temporary FASTA file with amino acid sequence
        let raw_seq = "NIFLCVWGGVFSRVSLCARGALSPRAPPLL*SVYTLYM*ERGDTRDISQSAHTPHI*KRENTQK";
        let fasta_file = create_temp_fasta(raw_seq)?;
        let bin_file = NamedTempFile::new()?;

        // Load from FASTA and save to binary
        let seq: Seq<Amino> = load_fasta(&fasta_file.path().to_path_buf())?;
        save_bin(&TaggedSeq::Amino(seq), &bin_file.path().to_path_buf())?;

        // Load back from binary and verify
        let loaded_seq = load_bin(&bin_file.path().to_path_buf())?;
        assert_eq!(loaded_seq, raw_seq);

        Ok(())
    }

    #[test]
    fn test_invalid_fasta() {
        // Create a temporary FASTA file with invalid sequence
        let invalid_seq = "ACTGCTAN"; // Invalid DNA sequence
        let fasta_file = create_temp_fasta(invalid_seq).unwrap();

        // Attempt to load as DNA sequence should fail
        let result: Result<Seq<Dna>, _> = load_fasta(&fasta_file.path().to_path_buf());
        assert!(result.is_err());
    }

    #[test]
    fn test_empty_fasta() {
        // Create an empty FASTA file
        let file = NamedTempFile::new().unwrap();

        // Attempt to load should fail
        let result: Result<Seq<Dna>, _> = load_fasta(&file.path().to_path_buf());
        assert!(result.is_err());
    }

    #[test]
    fn test_corrupted_binary() {
        // Create a binary file with invalid data
        let file = NamedTempFile::new().unwrap();
        write!(file.as_file(), "not a valid binary sequence").unwrap();

        // Attempt to load should fail
        let result = load_bin(&file.path().to_path_buf());
        assert!(result.is_err());
    }

    #[test]
    fn test_large_sequence() -> io::Result<()> {
        // Create a large sequence
        let large_dna = "ATCG".repeat(1000); // 4kb sequence
        let fasta_file = create_temp_fasta(&large_dna)?;
        let bin_file = NamedTempFile::new()?;

        // Test roundtrip
        let seq: Seq<Dna> = load_fasta(&fasta_file.path().to_path_buf())?;
        save_bin(&TaggedSeq::Dna(seq), &bin_file.path().to_path_buf())?;
        let loaded_seq = load_bin(&bin_file.path().to_path_buf())?;

        assert_eq!(loaded_seq, large_dna);
        Ok(())
    }
}