// fastars 0.1.0 - Docs.rs

//! I/O module for FASTQ reading and writing.
//!
//! This module provides efficient reading and writing of FASTQ files,
//! with support for gzip compression and parallel I/O.
//!
//! ## Submodules
//!
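//! - [`reader`]: FASTQ file reading with automatic format detection
//! - [`writer`]: FASTQ file writing with optional compression
//! - [`direct_reader`]: provides [`DirectFastqReader`] and [`DirectPairedFastqReader`]
//! - [`split_writer`]: provides [`SplitWriter`] and [`PairedSplitWriter`]
//! - [`pool`]: provides [`BatchPool`], [`FixedBatch`] and [`ReadPool`]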
//!
//! ## Example
//!
//! ```no_run
//! use fastars::io::{FastqReader, FastqWriter, OwnedRecord, CompressionType};
//! use std::path::Path;
//!
//! // Read records from a FASTQ file
//! let mut reader = FastqReader::new(Path::new("reads.fastq.gz")).unwrap();
//! let records = reader.read_batch(1000).unwrap();
//!
//! // Write records with compression
//! let mut writer = FastqWriter::new(
//!     Path::new("output.fastq.gz"),
//!     CompressionType::ParallelGzip,
//! ).unwrap();
//! writer.write_batch(&records).unwrap();
//! ```

pub mod direct_reader;
pub mod pool;
pub mod reader;
pub mod split_writer;
pub mod writer;

pub use direct_reader::{DirectFastqReader, DirectPairedFastqReader};
pub use pool::{BatchPool, FixedBatch, ReadPool};
pub use reader::{create_stdin_reader, FastqReader, PairedFastqReader};
pub use split_writer::{PairedSplitWriter, SplitWriter};
pub use writer::{create_stdout_writer, CompressionType, FastqWriter, PairedFastqWriter, StdoutFastqWriter};

/// An owned FASTQ record with name, sequence, and quality scores.
///
/// All three fields are byte vectors owned by the record, making it safe to
/// store in collections and pass between threads. A record can also be reused
/// as a scratch buffer through [`clear`](Self::clear) and
/// [`set_from`](Self::set_from), which keep the existing allocations.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct OwnedRecord {
    /// The read name/identifier (without the leading '@')
    pub name: Vec<u8>,
    /// The nucleotide sequence
    pub seq: Vec<u8>,
    /// The quality scores (Phred+33 encoded)
    pub qual: Vec<u8>,
}

impl OwnedRecord {
    /// Create a new `OwnedRecord` that takes ownership of the given parts.
    ///
    /// No validation is performed: the vectors are stored as given, and `seq`
    /// and `qual` are not checked for equal length.
    pub fn new(name: Vec<u8>, seq: Vec<u8>, qual: Vec<u8>) -> Self {
        Self { name, seq, qual }
    }

    /// Create an empty `OwnedRecord` with pre-allocated capacity.
    ///
    /// `seq` and `qual` each get room for at least `capacity` bytes and `name`
    /// for at least `capacity / 4` bytes. This is useful for object pooling to
    /// avoid repeated allocations.
    #[inline]
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            name: Vec::with_capacity(capacity / 4), // Names are typically shorter
            seq: Vec::with_capacity(capacity),
            qual: Vec::with_capacity(capacity),
        }
    }

    /// Clear all three vectors but retain their allocated capacity.
    ///
    /// This allows the record to be reused without reallocating.
    #[inline]
    pub fn clear(&mut self) {
        self.name.clear();
        self.seq.clear();
        self.qual.clear();
    }

    /// Overwrite the record's data with copies of the given slices.
    ///
    /// Each field is cleared and then refilled, so no new allocation happens
    /// when the existing capacity is already large enough for the new data.
    #[inline]
    pub fn set_from(&mut self, name: &[u8], seq: &[u8], qual: &[u8]) {
        self.name.clear();
        self.name.extend_from_slice(name);
        self.seq.clear();
        self.seq.extend_from_slice(seq);
        self.qual.clear();
        self.qual.extend_from_slice(qual);
    }

    /// Returns the length of the sequence in bytes.
    ///
    /// Only `seq` is measured; `name` and `qual` do not contribute.
    #[inline]
    pub fn len(&self) -> usize {
        self.seq.len()
    }

    /// Returns true if the sequence is empty.
    ///
    /// Only `seq` is inspected, so a record with a name but no bases is
    /// still reported as empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.seq.is_empty()
    }

    /// Get the name as text (lossy UTF-8 conversion).
    ///
    /// Returns a [`Cow`](std::borrow::Cow): borrowed when the bytes are valid
    /// UTF-8, otherwise an owned `String` with invalid sequences replaced by
    /// U+FFFD.
    pub fn name_str(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(&self.name)
    }

    /// Get the sequence as text (lossy UTF-8 conversion).
    ///
    /// Same borrowing/replacement behavior as [`name_str`](Self::name_str).
    pub fn seq_str(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(&self.seq)
    }

    /// Get the quality string as text (lossy UTF-8 conversion).
    ///
    /// Same borrowing/replacement behavior as [`name_str`](Self::name_str).
    pub fn qual_str(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(&self.qual)
    }

    /// Returns the sequence bytes.
    #[inline]
    pub fn seq(&self) -> &[u8] {
        &self.seq
    }

    /// Returns the quality score bytes.
    #[inline]
    pub fn qual(&self) -> &[u8] {
        &self.qual
    }

    /// Returns the name/identifier bytes (the contents of the `name` field).
    #[inline]
    pub fn id(&self) -> &[u8] {
        &self.name
    }

    /// Append one FASTQ record, given as slices, to a byte buffer.
    ///
    /// This is useful for batching writes to avoid many small writes.
    /// The bytes appended are: `@name\nseq\n+\nqual\n`. Existing buffer
    /// contents are left untouched and the slices are copied verbatim.
    #[inline]
    pub fn append_slices_to_buffer(buffer: &mut Vec<u8>, name: &[u8], seq: &[u8], qual: &[u8]) {
        // Framing bytes around the payload: '@', '\n' after the name,
        // "\n+\n" after the sequence, and the trailing '\n'.
        const FRAMING_LEN: usize = 6;

        // Grow the buffer at most once per record rather than potentially on
        // each of the pushes/extends below.
        buffer.reserve(name.len() + seq.len() + qual.len() + FRAMING_LEN);

        buffer.push(b'@');
        buffer.extend_from_slice(name);
        buffer.push(b'\n');
        buffer.extend_from_slice(seq);
        buffer.extend_from_slice(b"\n+\n");
        buffer.extend_from_slice(qual);
        buffer.push(b'\n');
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for `OwnedRecord`: construction, length queries, buffer
    //! reuse, string views, accessors, and FASTQ serialization.

    use super::*;

    /// Build the small four-base record shared by several tests.
    fn sample_record() -> OwnedRecord {
        OwnedRecord::new(b"read1".to_vec(), b"ACGT".to_vec(), b"IIII".to_vec())
    }

    #[test]
    fn test_owned_record_creation() {
        let record = sample_record();
        assert_eq!(record.name, b"read1");
        assert_eq!(record.seq, b"ACGT");
        assert_eq!(record.qual, b"IIII");
    }

    #[test]
    fn test_owned_record_len() {
        let record = OwnedRecord::new(
            b"read1".to_vec(),
            b"ACGTACGT".to_vec(),
            b"IIIIIIII".to_vec(),
        );
        assert_eq!(record.len(), 8);
        assert!(!record.is_empty());
    }

    #[test]
    fn test_owned_record_empty() {
        let record = OwnedRecord::new(Vec::new(), Vec::new(), Vec::new());
        assert!(record.is_empty());
        assert_eq!(record.len(), 0);
    }

    #[test]
    fn test_owned_record_string_conversion() {
        let record = sample_record();
        assert_eq!(record.name_str(), "read1");
        assert_eq!(record.seq_str(), "ACGT");
        assert_eq!(record.qual_str(), "IIII");
    }

    #[test]
    fn test_owned_record_lossy_string_conversion() {
        // 0xFF can never appear in valid UTF-8, so it must be replaced.
        let record = OwnedRecord::new(vec![b'r', 0xFF], b"ACGT".to_vec(), b"IIII".to_vec());
        assert_eq!(record.name_str(), "r\u{FFFD}");
        assert_eq!(record.seq_str(), "ACGT");
    }

    #[test]
    fn test_owned_record_clone() {
        let record = sample_record();
        let cloned = record.clone();
        assert_eq!(record, cloned);
    }

    #[test]
    fn test_owned_record_accessors() {
        let record = sample_record();
        assert_eq!(record.id(), &b"read1"[..]);
        assert_eq!(record.seq(), &b"ACGT"[..]);
        assert_eq!(record.qual(), &b"IIII"[..]);
    }

    #[test]
    fn test_owned_record_with_capacity() {
        let record = OwnedRecord::with_capacity(256);
        assert!(record.is_empty());
        assert!(record.seq.capacity() >= 256);
        assert!(record.qual.capacity() >= 256);
        assert!(record.name.capacity() >= 64); // capacity/4
    }

    #[test]
    fn test_owned_record_clear() {
        let mut record = sample_record();

        // Remember capacities
        let name_cap = record.name.capacity();
        let seq_cap = record.seq.capacity();
        let qual_cap = record.qual.capacity();

        record.clear();

        // Data cleared but capacity preserved
        assert!(record.is_empty());
        assert!(record.name.is_empty());
        assert!(record.qual.is_empty());
        assert!(record.name.capacity() >= name_cap);
        assert!(record.seq.capacity() >= seq_cap);
        assert!(record.qual.capacity() >= qual_cap);
    }

    #[test]
    fn test_owned_record_set_from() {
        let mut record = OwnedRecord::with_capacity(256);

        record.set_from(b"read1", b"ACGTACGT", b"IIIIIIII");
        assert_eq!(record.name, b"read1");
        assert_eq!(record.seq, b"ACGTACGT");
        assert_eq!(record.qual, b"IIIIIIII");

        // Reuse with different, shorter data: nothing from the first fill may linger
        record.set_from(b"read2", b"GGGG", b"HHHH");
        assert_eq!(record.name, b"read2");
        assert_eq!(record.seq, b"GGGG");
        assert_eq!(record.qual, b"HHHH");

        // Capacity is never smaller than the largest payload seen
        assert!(record.seq.capacity() >= 8);
    }

    #[test]
    fn test_owned_record_set_from_preserves_capacity() {
        let mut record = OwnedRecord::with_capacity(1024);
        let initial_cap = record.seq.capacity();

        // Set smaller data
        record.set_from(b"x", b"ACGT", b"IIII");

        // Capacity should be preserved
        assert!(record.seq.capacity() >= initial_cap);
    }

    #[test]
    fn test_append_slices_to_buffer() {
        let mut buffer = Vec::new();
        OwnedRecord::append_slices_to_buffer(&mut buffer, b"read1", b"ACGT", b"IIII");
        assert_eq!(buffer.as_slice(), &b"@read1\nACGT\n+\nIIII\n"[..]);
    }

    #[test]
    fn test_append_slices_to_buffer_appends_after_existing_data() {
        let mut buffer = Vec::new();
        OwnedRecord::append_slices_to_buffer(&mut buffer, b"read1", b"ACGT", b"IIII");
        OwnedRecord::append_slices_to_buffer(&mut buffer, b"read2", b"GG", b"HH");
        assert_eq!(
            buffer.as_slice(),
            &b"@read1\nACGT\n+\nIIII\n@read2\nGG\n+\nHH\n"[..]
        );
    }

    #[test]
    fn test_append_slices_to_buffer_empty_fields() {
        let mut buffer = Vec::new();
        OwnedRecord::append_slices_to_buffer(&mut buffer, b"", b"", b"");
        // Framing bytes are still written even when every field is empty
        assert_eq!(buffer.as_slice(), &b"@\n\n+\n\n"[..]);
    }
}