// genomicframe-core 0.2.0
//
// High-performance genomics I/O and interoperability layer.
// Documentation:
//! FASTA format support (sequence data)
//!
//! FASTA is a text-based format for representing nucleotide or protein sequences.

use crate::core::{GenomicReader, GenomicRecordIterator};
use crate::error::Result;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// A single FASTA record: one header line plus the sequence that follows it.
///
/// The type is plain owned data, so it derives the common value traits
/// (`Clone`, `Default`, `PartialEq`, `Eq`, `Hash`) in addition to `Debug`;
/// this lets records be compared in tests and stored in hashed collections.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
pub struct FastaRecord {
    /// Sequence identifier (the leading part of the header line, without '>')
    pub id: String,
    /// Optional free-text description that followed the identifier
    pub description: Option<String>,
    /// Sequence data
    pub sequence: String,
}

impl FastaRecord {
    /// Rebuild the header text (without the leading `>`).
    ///
    /// Returns `"<id> <description>"` when a description is present and a
    /// copy of the identifier alone otherwise.
    pub fn header(&self) -> String {
        self.description
            .as_deref()
            .map_or_else(|| self.id.clone(), |desc| format!("{} {}", self.id, desc))
    }

    /// Length of the sequence, measured in bytes of the underlying `String`.
    pub fn len(&self) -> usize {
        self.sequence.len()
    }

    /// Returns `true` when the record carries no sequence data.
    pub fn is_empty(&self) -> bool {
        self.sequence.is_empty()
    }
}

/// FASTA file reader
///
/// Wraps any buffered input and reads it one line at a time. At most one
/// line of look-ahead is buffered so a caller can inspect the next line
/// before deciding whether to take it.
#[derive(Debug)]
pub struct FastaReader<R: BufRead> {
    /// Underlying buffered input.
    reader: R,
    /// Line already read by `peek_line` but not yet taken by `consume_line`.
    current_line: Option<String>,
}

impl FastaReader<BufReader<File>> {
    /// Open a FASTA file from a path
    ///
    /// The file is wrapped in a `BufReader` before being handed to
    /// [`FastaReader::new`].
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened.
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::open(path)?;
        Ok(Self::new(BufReader::new(file)))
    }
}

impl<R: BufRead> FastaReader<R> {
    /// Create a new FASTA reader from a buffered reader
    ///
    /// No input is read until the first line is requested.
    pub fn new(reader: R) -> Self {
        Self {
            reader,
            current_line: None,
        }
    }

    /// Peek at the next line without consuming it
    ///
    /// Reads one line from the underlying reader only when no line is
    /// currently buffered; repeated calls return the same line until
    /// `consume_line` is called. The returned text still contains its line
    /// terminator. Returns `Ok(None)` once the reader reports end of input.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `read_line`.
    fn peek_line(&mut self) -> Result<Option<&str>> {
        if self.current_line.is_none() {
            let mut line = String::new();
            // `read_line` returning 0 bytes signals end of input.
            if self.reader.read_line(&mut line)? == 0 {
                return Ok(None);
            }
            self.current_line = Some(line);
        }
        Ok(self.current_line.as_deref())
    }

    /// Consume the peeked line
    ///
    /// Hands over ownership of the buffered line and clears the buffer.
    /// Returns `None` when nothing has been peeked since the last call.
    fn consume_line(&mut self) -> Option<String> {
        self.current_line.take()
    }
}

impl<R: BufRead> GenomicRecordIterator for FastaReader<R> {
    type Record = FastaRecord;

    /// Raw-record access is not implemented yet.
    ///
    /// This is a stub: it always returns `Ok(None)` and reads no input.
    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
        // TODO: Implement
        Ok(None)
    }

    /// Read the next FASTA record.
    ///
    /// Lines are skipped until one starting with `>` is found; that line is
    /// the header. Every following line up to the next header (or end of
    /// input) is trimmed and appended to the sequence. Returns `Ok(None)`
    /// when the input ends before another header is found.
    ///
    /// The header is split at the first whitespace character (space or tab):
    /// the text before it becomes `id`, the rest (with leading whitespace
    /// removed) becomes `description`.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while reading a line.
    fn next_record(&mut self) -> Result<Option<Self::Record>> {
        // Find the next header line, taking ownership of it instead of
        // cloning the peeked text.
        let header_line = loop {
            let is_header = match self.peek_line()? {
                Some(line) => line.starts_with('>'),
                None => return Ok(None),
            };
            match self.consume_line() {
                Some(line) if is_header => break line,
                // Not a header: the line has been discarded, keep scanning.
                _ => {}
            }
        };

        let (id, description) = parse_fasta_header(&header_line);

        // Read sequence lines until next header or EOF
        let mut sequence = String::new();
        while matches!(self.peek_line()?, Some(line) if !line.starts_with('>')) {
            if let Some(line) = self.consume_line() {
                sequence.push_str(line.trim());
            }
        }

        Ok(Some(FastaRecord {
            id,
            description,
            sequence,
        }))
    }
}

/// Split a FASTA header line into `(id, description)`.
///
/// Leading `>` characters and surrounding whitespace are removed first. The
/// identifier ends at the first whitespace character; whatever follows, minus
/// leading whitespace, is the description. A header with no whitespace has
/// no description.
fn parse_fasta_header(line: &str) -> (String, Option<String>) {
    let header = line.trim_start_matches('>').trim();
    match header.split_once(char::is_whitespace) {
        Some((id, rest)) => (id.to_string(), Some(rest.trim_start().to_string())),
        None => (header.to_string(), None),
    }
}

impl<R: BufRead> GenomicReader for FastaReader<R> {
    /// This reader exposes no metadata, so the unit type stands in.
    type Metadata = ();

    /// Returns a reference to the unit value; there is nothing to report.
    fn metadata(&self) -> &Self::Metadata {
        // No metadata for FASTA
        const NO_METADATA: &() = &();
        NO_METADATA
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Parse every record from an in-memory FASTA document.
    fn read_all(data: &str) -> Vec<FastaRecord> {
        let mut reader = FastaReader::new(Cursor::new(data));
        let mut records = Vec::new();
        while let Some(record) = reader.next_record().unwrap() {
            records.push(record);
        }
        records
    }

    #[test]
    fn test_fasta_parsing() {
        let fasta_data = ">seq1 description here\n\
                         ACGTACGTACGT\n\
                         ACGTACGT\n\
                         >seq2\n\
                         GGGGCCCC\n";

        let cursor = Cursor::new(fasta_data);
        let mut reader = FastaReader::new(cursor);

        let rec1 = reader.next_record().unwrap().unwrap();
        assert_eq!(rec1.id, "seq1");
        assert_eq!(rec1.description, Some("description here".to_string()));
        assert_eq!(rec1.sequence, "ACGTACGTACGTACGTACGT");

        let rec2 = reader.next_record().unwrap().unwrap();
        assert_eq!(rec2.id, "seq2");
        assert_eq!(rec2.description, None);
        assert_eq!(rec2.sequence, "GGGGCCCC");

        assert!(reader.next_record().unwrap().is_none());
    }

    #[test]
    fn test_empty_input_yields_no_records() {
        assert!(read_all("").is_empty());
    }

    #[test]
    fn test_crlf_line_endings() {
        // Carriage returns must not leak into any field.
        let records = read_all(">seq1 desc\r\nACGT\r\nTTTT\r\n");
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].id, "seq1");
        assert_eq!(records[0].description.as_deref(), Some("desc"));
        assert_eq!(records[0].sequence, "ACGTTTTT");
    }

    #[test]
    fn test_lines_before_first_header_are_skipped() {
        // Also covers blank lines inside a sequence and a missing final newline.
        let records = read_all("stray text\n\n>seq1\nAC\n\nGT");
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].id, "seq1");
        assert_eq!(records[0].description, None);
        assert_eq!(records[0].sequence, "ACGT");
    }

    #[test]
    fn test_record_helpers() {
        let records = read_all(">empty\n>full some text\nACGT\n");
        assert_eq!(records.len(), 2);

        assert!(records[0].is_empty());
        assert_eq!(records[0].len(), 0);
        assert_eq!(records[0].header(), "empty");

        assert!(!records[1].is_empty());
        assert_eq!(records[1].len(), 4);
        assert_eq!(records[1].header(), "full some text");
    }
}