biolic 0.1.0

A modular bioinformatics toolkit in Rust for long-read sequence processing
Documentation
//! Unified file reader.
//!
//! Provides format detection, transparent gzip decompression, and a uniform
//! `RecordReader` trait that all modules consume. The sole sequence parser is
//! `noodles` (decision D2). Supports FASTQ and FASTA (both plain and gzip).
//! Unaligned BAM is added next in Milestone 1.
//!
//! Gzip is handled with the pure-Rust `flate2` crate so the default binary is
//! fully static (decision D3). bz2/xz/zstd are only available when built with
//! the `extra-compression` feature.

use std::fs::File;
use std::io::{BufRead, BufReader, Read};
use std::path::Path;

use anyhow::{anyhow, Context, Result};
use flate2::read::MultiGzDecoder;

use crate::record::Record;

/// A streaming reader of `Record`s.
///
/// All format-specific readers implement this trait. Modules consume records
/// without needing to know the underlying format.
///
/// In this file the implementors are `FastqReader` and `FastaReader`; both are
/// handed out as `Box<dyn RecordReader>` by `open_reader` / `open_input`.
pub trait RecordReader {
    /// Return the next record, or `None` at EOF.
    ///
    /// # Errors
    ///
    /// The implementations in this file return `Err` when the underlying
    /// noodles parser reports a failure while reading the record.
    fn next_record(&mut self) -> Result<Option<Record>>;
}

/// Detected file format.
///
/// Produced by `detect_format`, which looks at the file name only; the file
/// contents are not inspected.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// FASTQ (`.fastq` / `.fq`).
    Fastq,
    /// FASTA (`.fasta` / `.fa` / `.fna`).
    Fasta,
    /// BAM. `detect_format` also maps `.sam` and `.cram` to this variant, and
    /// `open_reader` currently rejects it with a "not yet implemented" error.
    Bam,
}

/// Open the file at `path` and return a boxed `RecordReader` for it.
///
/// The format is chosen by `detect_format`, i.e. from the file name only.
/// Gzip compression is detected from the stream's magic bytes by the
/// individual readers, so it works regardless of extension.
///
/// `"-"` is not special-cased here: it carries no recognizable extension and
/// is therefore rejected by `detect_format`. Use `open_input` to read stdin.
///
/// # Errors
///
/// Fails if the format cannot be detected from the name, if the file cannot be
/// opened, or if the detected format is `Format::Bam` (not implemented yet).
pub fn open_reader<P: AsRef<Path>>(path: P) -> Result<Box<dyn RecordReader>> {
    let path = path.as_ref();

    let reader: Box<dyn RecordReader> = match detect_format(path)? {
        Format::Fastq => Box::new(FastqReader::open(path)?),
        Format::Fasta => Box::new(FastaReader::open(path)?),
        Format::Bam => {
            return Err(anyhow!(
                "BAM reading not yet implemented (path: {}). Tracking in Milestone 1.",
                path.display()
            ));
        }
    };

    Ok(reader)
}

/// Detect format from the file name.
///
/// The final path component is lowercased, a single trailing compression
/// suffix (`.gz`, `.bz2`, `.xz`, `.zst`) is stripped if present, and the
/// remainder is matched by extension:
/// - `.fastq`, `.fq` -> FASTQ
/// - `.fasta`, `.fa`, `.fna` -> FASTA
/// - `.bam`, `.sam`, `.cram` -> BAM
///
/// Only the name is examined; the file is never opened. Recognizing a
/// compression suffix here does not by itself mean the stream can be decoded:
/// the readers in this file only auto-detect gzip.
///
/// # Errors
///
/// Returns an error if `path` has no final component, if that component is not
/// valid UTF-8, or if no known extension matches.
pub fn detect_format(path: &Path) -> Result<Format> {
    // Matching is case-insensitive, so `READS.FQ.GZ` is treated like `reads.fq.gz`.
    let name = path
        .file_name()
        .and_then(|s| s.to_str())
        .ok_or_else(|| anyhow!("Invalid file path: {}", path.display()))?
        .to_lowercase();

    // Strip a trailing compression extension so the underlying format extension
    // is what we match against.
    let stem = strip_compression_ext(&name);

    if stem.ends_with(".fastq") || stem.ends_with(".fq") {
        Ok(Format::Fastq)
    } else if stem.ends_with(".fasta") || stem.ends_with(".fa") || stem.ends_with(".fna") {
        Ok(Format::Fasta)
    } else if stem.ends_with(".bam") || stem.ends_with(".sam") || stem.ends_with(".cram") {
        Ok(Format::Bam)
    } else {
        // Keep this list in sync with the branches above (`.fna` is accepted too).
        Err(anyhow!(
            "Could not detect format from filename: {}. \
             Supported extensions: .fastq[.gz], .fq[.gz], .fasta[.gz], .fa[.gz], .fna[.gz], \
             .bam, .sam, .cram",
            path.display()
        ))
    }
}

/// Drop at most one trailing compression suffix from a (lowercased) filename.
///
/// The suffixes `.gz`, `.bz2`, `.xz` and `.zst` are tried in that order and the
/// first one that matches is removed. Matching is case-sensitive, so callers
/// are expected to lowercase the name first. If none matches, `name` is
/// returned unchanged.
fn strip_compression_ext(name: &str) -> &str {
    const COMPRESSION_SUFFIXES: [&str; 4] = [".gz", ".bz2", ".xz", ".zst"];

    COMPRESSION_SUFFIXES
        .iter()
        .find_map(|&suffix| name.strip_suffix(suffix))
        .unwrap_or(name)
}

/// Wrap a reader, transparently decompressing if the stream begins with the
/// gzip magic bytes (0x1f 0x8b). Returns a `BufRead` suitable for noodles.
fn decompressed(reader: impl Read + 'static) -> Result<Box<dyn BufRead>> {
    let mut buf = BufReader::new(reader);
    // Peek without consuming so the decoder still sees the magic bytes.
    let is_gzip = {
        let head = buf.fill_buf().context("reading input header")?;
        head.len() >= 2 && head[0] == 0x1f && head[1] == 0x8b
    };
    if is_gzip {
        Ok(Box::new(BufReader::new(MultiGzDecoder::new(buf))))
    } else {
        Ok(Box::new(buf))
    }
}

/// FASTQ reader built on `noodles::fastq` with pure-Rust gzip support.
///
/// Construct it with `FastqReader::open` (from a path) or
/// `FastqReader::from_reader` (from any `Read`); records are then pulled
/// through its `RecordReader` implementation.
pub struct FastqReader {
    // noodles FASTQ parser over the boxed stream produced by `decompressed`.
    inner: noodles::fastq::io::Reader<Box<dyn BufRead>>,
}

impl FastqReader {
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let file =
            File::open(path).with_context(|| format!("Failed to open file: {}", path.display()))?;
        Self::from_reader(file)
    }

    /// Open from any reader (used for stdin).
    pub fn from_reader<R: Read + 'static>(reader: R) -> Result<Self> {
        let inner = decompressed(reader)?;
        Ok(Self {
            inner: noodles::fastq::io::Reader::new(inner),
        })
    }
}

impl RecordReader for FastqReader {
    /// Read the next FASTQ record and convert it into a `Record`.
    ///
    /// Returns `Ok(None)` once the parser reports that zero bytes were read.
    /// Quality scores are always present (`Some`); the description is `None`
    /// when the parsed description is empty.
    fn next_record(&mut self) -> Result<Option<Record>> {
        let mut raw = noodles::fastq::Record::default();
        let bytes_read = self
            .inner
            .read_record(&mut raw)
            .map_err(|e| anyhow!("FASTQ parse error: {}", e))?;

        // Zero bytes read is the parser's end-of-input signal.
        if bytes_read == 0 {
            return Ok(None);
        }

        let description = raw.description();
        let desc = (!description.is_empty()).then(|| description.to_vec());

        Ok(Some(Record::with_desc(
            raw.name().to_vec(),
            raw.sequence().to_vec(),
            Some(raw.quality_scores().to_vec()),
            desc,
        )))
    }
}

/// FASTA reader built on `noodles::fasta` with pure-Rust gzip support.
///
/// FASTA carries no quality scores, so `Record::qual` is always `None`.
///
/// Construct it with `FastaReader::open` (from a path) or
/// `FastaReader::from_reader` (from any `Read`).
pub struct FastaReader {
    // noodles FASTA parser over the boxed stream produced by `decompressed`.
    inner: noodles::fasta::io::Reader<Box<dyn BufRead>>,
}

impl FastaReader {
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let file =
            File::open(path).with_context(|| format!("Failed to open file: {}", path.display()))?;
        Self::from_reader(file)
    }

    /// Open from any reader (used for stdin).
    pub fn from_reader<R: Read + 'static>(reader: R) -> Result<Self> {
        let inner = decompressed(reader)?;
        Ok(Self {
            inner: noodles::fasta::io::Reader::new(inner),
        })
    }
}

impl RecordReader for FastaReader {
    /// Read the next FASTA entry (definition line plus sequence) as a `Record`.
    ///
    /// Returns `Ok(None)` once reading the definition line yields zero bytes.
    /// The record never carries quality scores. The id is the text up to the
    /// first whitespace of the definition line; everything after that single
    /// separator becomes the description.
    fn next_record(&mut self) -> Result<Option<Record>> {
        let mut header = String::new();
        let header_len = self
            .inner
            .read_definition(&mut header)
            .map_err(|e| anyhow!("FASTA parse error: {}", e))?;
        if header_len == 0 {
            return Ok(None);
        }

        let mut bases = Vec::new();
        self.inner
            .read_sequence(&mut bases)
            .map_err(|e| anyhow!("FASTA parse error: {}", e))?;

        // Definition line layout: ">name [optional description]". The leading
        // '>' is removed when present, and trailing whitespace is dropped.
        let trimmed = header.trim_end();
        let line = trimmed.strip_prefix('>').unwrap_or(trimmed);

        let (name, description) = line
            .split_once(char::is_whitespace)
            .map_or((line, None), |(name, rest)| (name, Some(rest)));

        let id = name.as_bytes().to_vec();
        let desc = description.map(|text| text.as_bytes().to_vec());

        Ok(Some(Record::with_desc(id, bases, None, desc)))
    }
}

/// Open an input source: `"-"` means stdin, anything else is a file path.
///
/// Stdin is always parsed as FASTQ (gzip auto-detected) for now; FASTA/BAM
/// piping is handled in Milestone 1. Every other path is forwarded to
/// `open_reader`, which picks the format from the file name.
pub fn open_input<P: AsRef<Path>>(path: P) -> Result<Box<dyn RecordReader>> {
    let path = path.as_ref();

    // Regular files take the extension-based route.
    if path != Path::new("-") {
        return open_reader(path);
    }

    let stdin_reader = FastqReader::from_reader(std::io::stdin())?;
    Ok(Box::new(stdin_reader))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every file name in `names` is detected as `expected`.
    fn assert_detected(names: &[&str], expected: Format) {
        for name in names {
            let detected = detect_format(Path::new(name)).unwrap();
            assert_eq!(detected, expected, "unexpected format for {}", name);
        }
    }

    #[test]
    fn test_detect_format_fastq() {
        assert_detected(
            &[
                "foo.fastq",
                "foo.fq",
                "foo.fastq.gz",
                "foo.fq.gz",
                "foo.fastq.bz2",
            ],
            Format::Fastq,
        );
    }

    #[test]
    fn test_detect_format_fasta() {
        assert_detected(
            &["foo.fasta", "foo.fa", "foo.fna", "foo.fa.gz"],
            Format::Fasta,
        );
    }

    #[test]
    fn test_detect_format_bam() {
        assert_detected(&["foo.bam", "foo.sam"], Format::Bam);
    }

    #[test]
    fn test_detect_format_unknown() {
        // Neither an unrelated extension nor a bare name is recognized.
        for name in ["foo.txt", "foo"] {
            assert!(detect_format(Path::new(name)).is_err());
        }
    }
}