scirs2_io/formats/bioinformatics/
mod.rs

1//! Bioinformatics file format support
2//!
3//! This module provides support for common bioinformatics file formats used
4//! in genomics, proteomics, and molecular biology research.
5//!
6//! ## Supported Formats
7//!
8//! - **FASTA**: Standard format for nucleotide and protein sequences
9//! - **FASTQ**: Sequences with per-base quality scores
10//! - **SAM/BAM**: Sequence Alignment/Map format
11//! - **VCF**: Variant Call Format for genomic variations
12//!
13//! ## Examples
14//!
15//! ```rust,no_run
16//! use scirs2_io::formats::bioinformatics::{FastaReader, FastqReader};
17//!
18//! // Read FASTA file
19//! let mut reader = FastaReader::open("sequences.fasta")?;
20//! for record in reader.records() {
21//!     let record = record?;
22//!     println!(">{}", record.id());
23//!     println!("{}", record.sequence());
24//! }
25//!
26//! // Read FASTQ file
27//! let mut reader = FastqReader::open("reads.fastq")?;
28//! for record in reader.records() {
29//!     let record = record?;
30//!     println!("@{}", record.id());
31//!     println!("{}", record.sequence());
32//!     println!("+");
33//!     println!("{}", record.quality());
34//! }
35//! # Ok::<(), scirs2_io::error::IoError>(())
36//! ```
37
38#![allow(dead_code)]
39#![allow(missing_docs)]
40
41use crate::error::{IoError, Result};
42use std::fs::File;
43use std::io::{BufRead, BufReader, BufWriter, Write};
44use std::path::Path;
45
46/// FASTA sequence record
47#[derive(Debug, Clone, PartialEq)]
48pub struct FastaRecord {
49    /// Sequence identifier (header line without '>')
50    id: String,
51    /// Optional description after the ID
52    description: Option<String>,
53    /// Sequence data
54    sequence: String,
55}
56
57impl FastaRecord {
58    /// Create a new FASTA record
59    pub fn new(id: String, sequence: String) -> Self {
60        Self {
61            id,
62            description: None,
63            sequence,
64        }
65    }
66
67    /// Create a new FASTA record with description
68    pub fn with_description(id: String, description: String, sequence: String) -> Self {
69        Self {
70            id,
71            description: Some(description),
72            sequence,
73        }
74    }
75
76    /// Get the sequence ID
77    pub fn id(&self) -> &str {
78        &self.id
79    }
80
81    /// Get the optional description
82    pub fn description(&self) -> Option<&str> {
83        self.description.as_deref()
84    }
85
86    /// Get the sequence
87    pub fn sequence(&self) -> &str {
88        &self.sequence
89    }
90
91    /// Get the sequence length
92    pub fn len(&self) -> usize {
93        self.sequence.len()
94    }
95
96    /// Check if the sequence is empty
97    pub fn is_empty(&self) -> bool {
98        self.sequence.is_empty()
99    }
100
101    /// Get the full header (ID + description)
102    pub fn header(&self) -> String {
103        match &self.description {
104            Some(desc) => format!("{} {}", self.id, desc),
105            None => self.id.clone(),
106        }
107    }
108}
109
110/// FASTA file reader
111pub struct FastaReader {
112    reader: BufReader<File>,
113    line_buffer: String,
114    lookahead: Option<String>,
115}
116
117impl FastaReader {
118    /// Open a FASTA file for reading
119    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
120        let file = File::open(path.as_ref())
121            .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
122        Ok(Self {
123            reader: BufReader::new(file),
124            line_buffer: String::new(),
125            lookahead: None,
126        })
127    }
128
129    /// Create an iterator over FASTA records
130    pub fn records(&mut self) -> FastaRecordIterator<'_> {
131        FastaRecordIterator { reader: self }
132    }
133
134    /// Read the next record
135    fn read_record(&mut self) -> Result<Option<FastaRecord>> {
136        self.line_buffer.clear();
137
138        // Check if we have a lookahead line (next header)
139        if let Some(lookahead) = self.lookahead.take() {
140            self.line_buffer = lookahead;
141        } else {
142            // Find the next header line
143            loop {
144                if self
145                    .reader
146                    .read_line(&mut self.line_buffer)
147                    .map_err(|e| IoError::ParseError(format!("Failed to read line: {e}")))?
148                    == 0
149                {
150                    return Ok(None); // EOF
151                }
152
153                if self.line_buffer.starts_with('>') {
154                    break;
155                }
156                self.line_buffer.clear();
157            }
158        }
159
160        // Parse header
161        let header = self.line_buffer[1..].trim().to_string();
162        let (id, description) = if let Some(space_pos) = header.find(' ') {
163            let (id_part, desc_part) = header.split_at(space_pos);
164            (id_part.to_string(), Some(desc_part[1..].to_string()))
165        } else {
166            (header, None)
167        };
168
169        // Read sequence lines until next header or EOF
170        let mut sequence = String::new();
171        self.line_buffer.clear();
172
173        loop {
174            let bytes_read = self
175                .reader
176                .read_line(&mut self.line_buffer)
177                .map_err(|e| IoError::ParseError(format!("Failed to read line: {e}")))?;
178
179            if bytes_read == 0 || self.line_buffer.starts_with('>') {
180                // Reached next record or EOF
181                if self.line_buffer.starts_with('>') {
182                    // Store the header line for the next iteration
183                    self.lookahead = Some(self.line_buffer.clone());
184                }
185                break;
186            }
187
188            sequence.push_str(self.line_buffer.trim());
189            if !self.line_buffer.starts_with('>') {
190                self.line_buffer.clear();
191            }
192        }
193
194        Ok(Some(FastaRecord {
195            id,
196            description,
197            sequence,
198        }))
199    }
200}
201
202/// Iterator over FASTA records
203pub struct FastaRecordIterator<'a> {
204    reader: &'a mut FastaReader,
205}
206
207impl Iterator for FastaRecordIterator<'_> {
208    type Item = Result<FastaRecord>;
209
210    fn next(&mut self) -> Option<Self::Item> {
211        match self.reader.read_record() {
212            Ok(Some(record)) => Some(Ok(record)),
213            Ok(None) => None,
214            Err(e) => Some(Err(e)),
215        }
216    }
217}
218
219/// FASTA file writer
220pub struct FastaWriter {
221    writer: BufWriter<File>,
222    line_width: usize,
223}
224
225impl FastaWriter {
226    /// Create a new FASTA file for writing
227    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
228        let file = File::create(path.as_ref())
229            .map_err(|e| IoError::FileError(format!("Failed to create file: {e}")))?;
230        Ok(Self {
231            writer: BufWriter::new(file),
232            line_width: 80, // Standard FASTA line width
233        })
234    }
235
236    /// Set the line width for sequence wrapping
237    pub fn set_line_width(&mut self, width: usize) {
238        self.line_width = width;
239    }
240
241    /// Write a FASTA record
242    pub fn write_record(&mut self, record: &FastaRecord) -> Result<()> {
243        // Write header
244        write!(self.writer, ">{}", record.header())
245            .map_err(|e| IoError::FileError(format!("Failed to write header: {e}")))?;
246        writeln!(self.writer)
247            .map_err(|e| IoError::FileError(format!("Failed to write newline: {e}")))?;
248
249        // Write sequence with line wrapping
250        let sequence = record.sequence();
251        for chunk in sequence.as_bytes().chunks(self.line_width) {
252            self.writer
253                .write_all(chunk)
254                .map_err(|e| IoError::FileError(format!("Failed to write sequence: {e}")))?;
255            writeln!(self.writer)
256                .map_err(|e| IoError::FileError(format!("Failed to write newline: {e}")))?;
257        }
258
259        Ok(())
260    }
261
262    /// Flush the writer
263    pub fn flush(&mut self) -> Result<()> {
264        self.writer
265            .flush()
266            .map_err(|e| IoError::FileError(format!("Failed to flush: {e}")))
267    }
268}
269
270/// FASTQ quality encoding
271#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
272pub enum QualityEncoding {
273    /// Sanger/Illumina 1.8+ (Phred+33)
274    #[default]
275    Sanger,
276    /// Illumina 1.3-1.7 (Phred+64)
277    Illumina,
278}
279
280/// FASTQ sequence record
281#[derive(Debug, Clone, PartialEq)]
282pub struct FastqRecord {
283    /// Sequence identifier
284    id: String,
285    /// Optional description
286    description: Option<String>,
287    /// Sequence data
288    sequence: String,
289    /// Quality scores (ASCII encoded)
290    quality: String,
291}
292
293impl FastqRecord {
294    /// Create a new FASTQ record
295    pub fn new(id: String, sequence: String, quality: String) -> Result<Self> {
296        if sequence.len() != quality.len() {
297            return Err(IoError::ParseError(format!(
298                "Sequence and quality lengths don't match: {} vs {}",
299                sequence.len(),
300                quality.len()
301            )));
302        }
303
304        Ok(Self {
305            id,
306            description: None,
307            sequence,
308            quality,
309        })
310    }
311
312    /// Create a new FASTQ record with description
313    pub fn with_description(
314        id: String,
315        description: String,
316        sequence: String,
317        quality: String,
318    ) -> Result<Self> {
319        if sequence.len() != quality.len() {
320            return Err(IoError::ParseError(format!(
321                "Sequence and quality lengths don't match: {} vs {}",
322                sequence.len(),
323                quality.len()
324            )));
325        }
326
327        Ok(Self {
328            id,
329            description: Some(description),
330            sequence,
331            quality,
332        })
333    }
334
335    /// Get the sequence ID
336    pub fn id(&self) -> &str {
337        &self.id
338    }
339
340    /// Get the optional description
341    pub fn description(&self) -> Option<&str> {
342        self.description.as_deref()
343    }
344
345    /// Get the sequence
346    pub fn sequence(&self) -> &str {
347        &self.sequence
348    }
349
350    /// Get the quality string
351    pub fn quality(&self) -> &str {
352        &self.quality
353    }
354
355    /// Get the sequence length
356    pub fn len(&self) -> usize {
357        self.sequence.len()
358    }
359
360    /// Check if the sequence is empty
361    pub fn is_empty(&self) -> bool {
362        self.sequence.is_empty()
363    }
364
365    /// Get quality scores as numeric values
366    pub fn quality_scores(&self, encoding: QualityEncoding) -> Vec<u8> {
367        let offset = match encoding {
368            QualityEncoding::Sanger => 33,
369            QualityEncoding::Illumina => 64,
370        };
371
372        self.quality
373            .bytes()
374            .map(|b| b.saturating_sub(offset))
375            .collect()
376    }
377
378    /// Get the full header (ID + description)
379    pub fn header(&self) -> String {
380        match &self.description {
381            Some(desc) => format!("{} {}", self.id, desc),
382            None => self.id.clone(),
383        }
384    }
385}
386
387/// FASTQ file reader
388pub struct FastqReader {
389    reader: BufReader<File>,
390    #[allow(dead_code)]
391    encoding: QualityEncoding,
392    line_buffer: String,
393}
394
395impl FastqReader {
396    /// Open a FASTQ file for reading
397    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
398        let file = File::open(path.as_ref())
399            .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
400        Ok(Self {
401            reader: BufReader::new(file),
402            encoding: QualityEncoding::default(),
403            line_buffer: String::new(),
404        })
405    }
406
407    /// Open a FASTQ file with specific quality encoding
408    pub fn open_with_encoding<P: AsRef<Path>>(path: P, encoding: QualityEncoding) -> Result<Self> {
409        let file = File::open(path.as_ref())
410            .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
411        Ok(Self {
412            reader: BufReader::new(file),
413            encoding,
414            line_buffer: String::new(),
415        })
416    }
417
418    /// Create an iterator over FASTQ records
419    pub fn records(&mut self) -> FastqRecordIterator<'_> {
420        FastqRecordIterator { reader: self }
421    }
422
423    /// Read the next record
424    fn read_record(&mut self) -> Result<Option<FastqRecord>> {
425        // Read header line
426        self.line_buffer.clear();
427        if self
428            .reader
429            .read_line(&mut self.line_buffer)
430            .map_err(|e| IoError::ParseError(format!("Failed to read header: {e}")))?
431            == 0
432        {
433            return Ok(None); // EOF
434        }
435
436        if !self.line_buffer.starts_with('@') {
437            return Err(IoError::ParseError(format!(
438                "Expected '@' at start of header, found: {}",
439                self.line_buffer.trim()
440            )));
441        }
442
443        // Parse header
444        let header = self.line_buffer[1..].trim().to_string();
445        let (id, description) = if let Some(space_pos) = header.find(' ') {
446            let (id_part, desc_part) = header.split_at(space_pos);
447            (id_part.to_string(), Some(desc_part[1..].to_string()))
448        } else {
449            (header, None)
450        };
451
452        // Read sequence line
453        self.line_buffer.clear();
454        self.reader
455            .read_line(&mut self.line_buffer)
456            .map_err(|e| IoError::ParseError(format!("Failed to read sequence: {e}")))?;
457        let sequence = self.line_buffer.trim().to_string();
458
459        // Read separator line
460        self.line_buffer.clear();
461        self.reader
462            .read_line(&mut self.line_buffer)
463            .map_err(|e| IoError::ParseError(format!("Failed to read separator: {e}")))?;
464        if !self.line_buffer.starts_with('+') {
465            return Err(IoError::ParseError(format!(
466                "Expected '+' separator, found: {}",
467                self.line_buffer.trim()
468            )));
469        }
470
471        // Read quality line
472        self.line_buffer.clear();
473        self.reader
474            .read_line(&mut self.line_buffer)
475            .map_err(|e| IoError::ParseError(format!("Failed to read quality: {e}")))?;
476        let quality = self.line_buffer.trim().to_string();
477
478        FastqRecord::new(id.clone(), sequence.clone(), quality.clone())
479            .or_else(|_| {
480                FastqRecord::with_description(
481                    id,
482                    description.unwrap_or_default(),
483                    sequence,
484                    quality,
485                )
486            })
487            .map(Some)
488    }
489}
490
491/// Iterator over FASTQ records
492pub struct FastqRecordIterator<'a> {
493    reader: &'a mut FastqReader,
494}
495
496impl Iterator for FastqRecordIterator<'_> {
497    type Item = Result<FastqRecord>;
498
499    fn next(&mut self) -> Option<Self::Item> {
500        match self.reader.read_record() {
501            Ok(Some(record)) => Some(Ok(record)),
502            Ok(None) => None,
503            Err(e) => Some(Err(e)),
504        }
505    }
506}
507
508/// FASTQ file writer
509pub struct FastqWriter {
510    writer: BufWriter<File>,
511    encoding: QualityEncoding,
512}
513
514impl FastqWriter {
515    /// Create a new FASTQ file for writing
516    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
517        let file = File::create(path.as_ref())
518            .map_err(|e| IoError::FileError(format!("Failed to create file: {e}")))?;
519        Ok(Self {
520            writer: BufWriter::new(file),
521            encoding: QualityEncoding::default(),
522        })
523    }
524
525    /// Create a new FASTQ file with specific quality encoding
526    pub fn create_with_encoding<P: AsRef<Path>>(
527        path: P,
528        encoding: QualityEncoding,
529    ) -> Result<Self> {
530        let file = File::create(path.as_ref())
531            .map_err(|e| IoError::FileError(format!("Failed to create file: {e}")))?;
532        Ok(Self {
533            writer: BufWriter::new(file),
534            encoding,
535        })
536    }
537
538    /// Write a FASTQ record
539    pub fn write_record(&mut self, record: &FastqRecord) -> Result<()> {
540        // Write header
541        writeln!(self.writer, "@{}", record.header())
542            .map_err(|e| IoError::FileError(format!("Failed to write header: {e}")))?;
543
544        // Write sequence
545        writeln!(self.writer, "{}", record.sequence())
546            .map_err(|e| IoError::FileError(format!("Failed to write sequence: {e}")))?;
547
548        // Write separator
549        writeln!(self.writer, "+")
550            .map_err(|e| IoError::FileError(format!("Failed to write separator: {e}")))?;
551
552        // Write quality
553        writeln!(self.writer, "{}", record.quality())
554            .map_err(|e| IoError::FileError(format!("Failed to write quality: {e}")))?;
555
556        Ok(())
557    }
558
559    /// Write a FASTQ record from numeric quality scores
560    pub fn write_record_with_scores(
561        &mut self,
562        id: &str,
563        sequence: &str,
564        quality_scores: &[u8],
565    ) -> Result<()> {
566        if sequence.len() != quality_scores.len() {
567            return Err(IoError::FileError(format!(
568                "Sequence and quality lengths don't match: {} vs {}",
569                sequence.len(),
570                quality_scores.len()
571            )));
572        }
573
574        let offset = match self.encoding {
575            QualityEncoding::Sanger => 33,
576            QualityEncoding::Illumina => 64,
577        };
578
579        let quality_string: String = quality_scores
580            .iter()
581            .map(|&score| (score.saturating_add(offset)) as char)
582            .collect();
583
584        let record = FastqRecord::new(id.to_string(), sequence.to_string(), quality_string)?;
585        self.write_record(&record)
586    }
587
588    /// Flush the writer
589    pub fn flush(&mut self) -> Result<()> {
590        self.writer
591            .flush()
592            .map_err(|e| IoError::FileError(format!("Failed to flush: {e}")))
593    }
594}
595
596/// Count sequences in a FASTA file
597#[allow(dead_code)]
598pub fn count_fastasequences<P: AsRef<Path>>(path: P) -> Result<usize> {
599    let file = File::open(path.as_ref())
600        .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
601    let reader = BufReader::new(file);
602
603    let count = reader
604        .lines()
605        .map_while(|result| result.ok())
606        .filter(|line| line.starts_with('>'))
607        .count();
608
609    Ok(count)
610}
611
612/// Count sequences in a FASTQ file
613#[allow(dead_code)]
614pub fn count_fastqsequences<P: AsRef<Path>>(path: P) -> Result<usize> {
615    let file = File::open(path.as_ref())
616        .map_err(|_e| IoError::FileNotFound(path.as_ref().to_string_lossy().to_string()))?;
617    let reader = BufReader::new(file);
618
619    let line_count = reader.lines().count();
620    if line_count % 4 != 0 {
621        return Err(IoError::ParseError(format!(
622            "Invalid FASTQ file: line count {line_count} is not divisible by 4"
623        )));
624    }
625
626    Ok(line_count / 4)
627}
628
629/// Sequence analysis utilities
630pub mod analysis {
631    use super::*;
632    use std::collections::HashMap;
633
634    /// Calculate GC content of a DNA sequence
635    pub fn gc_content(sequence: &str) -> f64 {
636        if sequence.is_empty() {
637            return 0.0;
638        }
639
640        let gc_count = sequence
641            .chars()
642            .filter(|&c| c == 'G' || c == 'C' || c == 'g' || c == 'c')
643            .count();
644
645        gc_count as f64 / sequence.len() as f64
646    }
647
648    /// Calculate nucleotide composition
649    pub fn nucleotide_composition(sequence: &str) -> HashMap<char, usize> {
650        let mut composition = HashMap::new();
651
652        for nucleotide in sequence.chars() {
653            *composition
654                .entry(nucleotide.to_ascii_uppercase())
655                .or_insert(0) += 1;
656        }
657
658        composition
659    }
660
661    /// Reverse complement of a DNA sequence
662    pub fn reverse_complement(sequence: &str) -> String {
663        sequence
664            .chars()
665            .rev()
666            .map(|c| match c.to_ascii_uppercase() {
667                'A' => 'T',
668                'T' => 'A',
669                'G' => 'C',
670                'C' => 'G',
671                'U' => 'A', // RNA
672                'N' => 'N', // Unknown
673                _ => c,
674            })
675            .collect()
676    }
677
678    /// Translate DNA sequence to protein (single frame)
679    pub fn translate_dna(sequence: &str) -> String {
680        let codon_table = get_standard_genetic_code();
681
682        sequence
683            .chars()
684            .collect::<Vec<_>>()
685            .chunks(3)
686            .filter_map(|codon| {
687                if codon.len() == 3 {
688                    let codon_str: String = codon.iter().collect::<String>().to_uppercase();
689                    Some(codon_table.get(&codon_str).unwrap_or(&'X').to_owned())
690                } else {
691                    None
692                }
693            })
694            .collect()
695    }
696
697    /// Get standard genetic code table
698    fn get_standard_genetic_code() -> HashMap<String, char> {
699        let mut code = HashMap::new();
700
701        // Standard genetic code (NCBI translation table 1)
702        let codons = [
703            ("TTT", 'F'),
704            ("TTC", 'F'),
705            ("TTA", 'L'),
706            ("TTG", 'L'),
707            ("TCT", 'S'),
708            ("TCC", 'S'),
709            ("TCA", 'S'),
710            ("TCG", 'S'),
711            ("TAT", 'Y'),
712            ("TAC", 'Y'),
713            ("TAA", '*'),
714            ("TAG", '*'),
715            ("TGT", 'C'),
716            ("TGC", 'C'),
717            ("TGA", '*'),
718            ("TGG", 'W'),
719            ("CTT", 'L'),
720            ("CTC", 'L'),
721            ("CTA", 'L'),
722            ("CTG", 'L'),
723            ("CCT", 'P'),
724            ("CCC", 'P'),
725            ("CCA", 'P'),
726            ("CCG", 'P'),
727            ("CAT", 'H'),
728            ("CAC", 'H'),
729            ("CAA", 'Q'),
730            ("CAG", 'Q'),
731            ("CGT", 'R'),
732            ("CGC", 'R'),
733            ("CGA", 'R'),
734            ("CGG", 'R'),
735            ("ATT", 'I'),
736            ("ATC", 'I'),
737            ("ATA", 'I'),
738            ("ATG", 'M'),
739            ("ACT", 'T'),
740            ("ACC", 'T'),
741            ("ACA", 'T'),
742            ("ACG", 'T'),
743            ("AAT", 'N'),
744            ("AAC", 'N'),
745            ("AAA", 'K'),
746            ("AAG", 'K'),
747            ("AGT", 'S'),
748            ("AGC", 'S'),
749            ("AGA", 'R'),
750            ("AGG", 'R'),
751            ("GTT", 'V'),
752            ("GTC", 'V'),
753            ("GTA", 'V'),
754            ("GTG", 'V'),
755            ("GCT", 'A'),
756            ("GCC", 'A'),
757            ("GCA", 'A'),
758            ("GCG", 'A'),
759            ("GAT", 'D'),
760            ("GAC", 'D'),
761            ("GAA", 'E'),
762            ("GAG", 'E'),
763            ("GGT", 'G'),
764            ("GGC", 'G'),
765            ("GGA", 'G'),
766            ("GGG", 'G'),
767        ];
768
769        for (codon, amino_acid) in &codons {
770            code.insert(codon.to_string(), *amino_acid);
771        }
772
773        code
774    }
775
776    /// Find open reading frames (ORFs) in a DNA sequence
777    pub fn find_orfs(sequence: &str, minlength: usize) -> Vec<Orf> {
778        let mut orfs = Vec::new();
779        let seq_upper = sequence.to_uppercase();
780
781        // Check all three reading frames
782        for frame in 0..3 {
783            let frame_seq = &seq_upper[frame..];
784            let mut start_pos = None;
785
786            for (pos, codon) in frame_seq.chars().collect::<Vec<_>>().chunks(3).enumerate() {
787                if codon.len() < 3 {
788                    break;
789                }
790
791                let codon_str: String = codon.iter().collect();
792
793                // Start codon
794                if codon_str == "ATG" && start_pos.is_none() {
795                    start_pos = Some(frame + pos * 3);
796                }
797
798                // Stop codon
799                if matches!(codon_str.as_str(), "TAA" | "TAG" | "TGA") {
800                    if let Some(start) = start_pos {
801                        let length = frame + pos * 3 + 3 - start;
802                        if length >= minlength {
803                            let orf_seq = &sequence[start..start + length];
804                            orfs.push(Orf {
805                                start_pos: start,
806                                end_pos: start + length,
807                                frame: frame as i8,
808                                sequence: orf_seq.to_string(),
809                                protein: translate_dna(orf_seq),
810                            });
811                        }
812                        start_pos = None;
813                    }
814                }
815            }
816        }
817
818        orfs
819    }
820
821    /// Open Reading Frame
822    #[derive(Debug, Clone, PartialEq)]
823    pub struct Orf {
824        pub start_pos: usize,
825        pub end_pos: usize,
826        pub frame: i8,
827        pub sequence: String,
828        pub protein: String,
829    }
830
831    impl Orf {
832        pub fn length(&self) -> usize {
833            self.sequence.len()
834        }
835    }
836
837    /// Calculate basic sequence statistics
838    pub fn sequence_stats(records: &[FastaRecord]) -> SequenceStats {
839        if records.is_empty() {
840            return SequenceStats::default();
841        }
842
843        let lengths: Vec<usize> = records.iter().map(|r| r.len()).collect();
844        let totallength: usize = lengths.iter().sum();
845        let minlength = *lengths.iter().min().unwrap();
846        let maxlength = *lengths.iter().max().unwrap();
847        let meanlength = totallength as f64 / records.len() as f64;
848
849        // Calculate N50
850        let mut sortedlengths = lengths.clone();
851        sortedlengths.sort_by(|a, b| b.cmp(a)); // Sort in descending order
852        let mut cumulative = 0;
853        let half_total = totallength / 2;
854        let mut n50 = 0;
855
856        for length in sortedlengths {
857            cumulative += length;
858            if cumulative >= half_total {
859                n50 = length;
860                break;
861            }
862        }
863
864        // Calculate overall GC content
865        let total_gc: usize = records
866            .iter()
867            .map(|r| {
868                r.sequence()
869                    .chars()
870                    .filter(|&c| c == 'G' || c == 'C' || c == 'g' || c == 'c')
871                    .count()
872            })
873            .sum();
874        let gc_content = total_gc as f64 / totallength as f64;
875
876        SequenceStats {
877            numsequences: records.len(),
878            totallength,
879            minlength,
880            maxlength,
881            meanlength,
882            n50,
883            gc_content,
884        }
885    }
886
887    /// Sequence statistics
888    #[derive(Debug, Clone, PartialEq)]
889    pub struct SequenceStats {
890        pub numsequences: usize,
891        pub totallength: usize,
892        pub minlength: usize,
893        pub maxlength: usize,
894        pub meanlength: f64,
895        pub n50: usize,
896        pub gc_content: f64,
897    }
898
899    impl Default for SequenceStats {
900        fn default() -> Self {
901            Self {
902                numsequences: 0,
903                totallength: 0,
904                minlength: 0,
905                maxlength: 0,
906                meanlength: 0.0,
907                n50: 0,
908                gc_content: 0.0,
909            }
910        }
911    }
912
913    /// Quality analysis for FASTQ data
914    pub fn quality_stats(records: &[FastqRecord], encoding: QualityEncoding) -> QualityStats {
915        if records.is_empty() {
916            return QualityStats::default();
917        }
918
919        let mut all_scores = Vec::new();
920        let mut position_scores: HashMap<usize, Vec<u8>> = HashMap::new();
921
922        for record in records {
923            let scores = record.quality_scores(encoding);
924            all_scores.extend_from_slice(&scores);
925
926            for (pos, &score) in scores.iter().enumerate() {
927                position_scores.entry(pos).or_default().push(score);
928            }
929        }
930
931        let mean_quality = all_scores.iter().sum::<u8>() as f64 / all_scores.len() as f64;
932        let min_quality = *all_scores.iter().min().unwrap_or(&0);
933        let max_quality = *all_scores.iter().max().unwrap_or(&0);
934
935        // Calculate per-position mean qualities
936        let mut per_position_mean = Vec::new();
937        let max_pos = position_scores.keys().max().unwrap_or(&0);
938
939        for pos in 0..=*max_pos {
940            if let Some(scores) = position_scores.get(&pos) {
941                let mean = scores.iter().sum::<u8>() as f64 / scores.len() as f64;
942                per_position_mean.push(mean);
943            } else {
944                per_position_mean.push(0.0);
945            }
946        }
947
948        QualityStats {
949            mean_quality,
950            min_quality,
951            max_quality,
952            per_position_mean,
953        }
954    }
955
956    /// Quality statistics for FASTQ data
957    #[derive(Debug, Clone, PartialEq)]
958    pub struct QualityStats {
959        pub mean_quality: f64,
960        pub min_quality: u8,
961        pub max_quality: u8,
962        pub per_position_mean: Vec<f64>,
963    }
964
965    impl Default for QualityStats {
966        fn default() -> Self {
967            Self {
968                mean_quality: 0.0,
969                min_quality: 0,
970                max_quality: 0,
971                per_position_mean: Vec::new(),
972            }
973        }
974    }
975}
976
977#[cfg(test)]
978mod tests {
979    use super::*;
980    use tempfile::NamedTempFile;
981
982    #[test]
983    fn test_fasta_read_write() -> Result<()> {
984        let temp_file = NamedTempFile::new().unwrap();
985        let path = temp_file.path();
986
987        // Write test data
988        {
989            let mut writer = FastaWriter::create(path)?;
990            writer.write_record(&FastaRecord::new(
991                "seq1".to_string(),
992                "ATCGATCGATCG".to_string(),
993            ))?;
994            writer.write_record(&FastaRecord::with_description(
995                "seq2".to_string(),
996                "test sequence".to_string(),
997                "GCTAGCTAGCTA".to_string(),
998            ))?;
999            writer.flush()?;
1000        }
1001
1002        // Read and verify
1003        {
1004            let mut reader = FastaReader::open(path)?;
1005            let records: Vec<_> = reader.records().collect::<Result<Vec<_>>>()?;
1006
1007            assert_eq!(records.len(), 2);
1008            assert_eq!(records[0].id(), "seq1");
1009            assert_eq!(records[0].sequence(), "ATCGATCGATCG");
1010            assert_eq!(records[1].id(), "seq2");
1011            assert_eq!(records[1].description(), Some("test sequence"));
1012            assert_eq!(records[1].sequence(), "GCTAGCTAGCTA");
1013        }
1014
1015        Ok(())
1016    }
1017
1018    #[test]
1019    fn test_fastq_read_write() -> Result<()> {
1020        let temp_file = NamedTempFile::new().unwrap();
1021        let path = temp_file.path();
1022
1023        // Write test data
1024        {
1025            let mut writer = FastqWriter::create(path)?;
1026            writer.write_record(&FastqRecord::new(
1027                "read1".to_string(),
1028                "ATCG".to_string(),
1029                "IIII".to_string(),
1030            )?)?;
1031            writer.write_record_with_scores("read2", "GCTA", &[30, 35, 40, 35])?;
1032            writer.flush()?;
1033        }
1034
1035        // Read and verify
1036        {
1037            let mut reader = FastqReader::open(path)?;
1038            let records: Vec<_> = reader.records().collect::<Result<Vec<_>>>()?;
1039
1040            assert_eq!(records.len(), 2);
1041            assert_eq!(records[0].id(), "read1");
1042            assert_eq!(records[0].sequence(), "ATCG");
1043            assert_eq!(records[0].quality(), "IIII");
1044
1045            assert_eq!(records[1].id(), "read2");
1046            assert_eq!(records[1].sequence(), "GCTA");
1047            let scores = records[1].quality_scores(QualityEncoding::Sanger);
1048            assert_eq!(scores, vec![30, 35, 40, 35]);
1049        }
1050
1051        Ok(())
1052    }
1053
1054    #[test]
1055    fn testsequence_counting() -> Result<()> {
1056        let fasta_file = NamedTempFile::new().unwrap();
1057        let fastq_file = NamedTempFile::new().unwrap();
1058
1059        // Create test FASTA
1060        {
1061            let mut writer = FastaWriter::create(fasta_file.path())?;
1062            for i in 0..5 {
1063                writer.write_record(&FastaRecord::new(format!("seq{}", i), "ATCG".to_string()))?;
1064            }
1065            writer.flush()?;
1066        }
1067
1068        // Create test FASTQ
1069        {
1070            let mut writer = FastqWriter::create(fastq_file.path())?;
1071            for i in 0..3 {
1072                writer.write_record(&FastqRecord::new(
1073                    format!("read{}", i),
1074                    "ATCG".to_string(),
1075                    "IIII".to_string(),
1076                )?)?;
1077            }
1078            writer.flush()?;
1079        }
1080
1081        assert_eq!(count_fastasequences(fasta_file.path())?, 5);
1082        assert_eq!(count_fastqsequences(fastq_file.path())?, 3);
1083
1084        Ok(())
1085    }
1086}