Skip to main content

genomicframe_core/formats/vcf/
reader.rs

1//! VCF (Variant Call Format) reader
2//!
3//! VCF is a text-based format for representing genetic variants.
4//! This module provides efficient, streaming-first parsing with gzip support.
5//!
6//! # Design Philosophy
7//!
8//! - **Streaming by default**: Records are never buffered unless explicitly requested
9//! - **Zero-copy parsing**: Where possible, records borrow from the input buffer
10//! - **Automatic compression detection**: Handles `.vcf` and `.vcf.gz` transparently
11//! - **Lazy evaluation**: Process millions of variants with O(1) memory
12//!
13//! # Examples
14//!
15//! ```no_run
16//! use genomicframe_core::formats::vcf::VcfReader;
17//! use genomicframe_core::core::GenomicRecordIterator;
18//!
19//! // Streaming: O(1) memory, processes one record at a time
20//! let mut reader = VcfReader::from_path("large.vcf.gz")?;
21//! let mut high_quality_count = 0;
22//!
23//! while let Some(record) = reader.next_record()? {
24//!     if record.is_pass() && record.qual.unwrap_or(0.0) > 30.0 {
25//!         high_quality_count += 1;
26//!     }
27//! }
28//!
29//! // Chunked: Process in fixed-size batches
30//! let reader = VcfReader::from_path("large.vcf.gz")?;
31//! for chunk_result in reader.chunks(10_000) {
32//!     let chunk = chunk_result?;
33//!     // Process 10k variants at a time, then discard
34//! }
35//! # Ok::<(), genomicframe_core::error::Error>(())
36//! ```
37
38use crate::core::{GenomicPosition, GenomicReader, GenomicRecordIterator};
39use crate::error::{Error, Result};
40use crate::io::Compression;
41use crate::interval::annotation::AnnotationIndex;
42use crate::formats::vcf::annotated::AnnotatedVcfRecord;
43use flate2::read::MultiGzDecoder;
44use std::fs::File;
45use std::io::{BufRead, BufReader};
46use std::path::Path;
47
48/// A single VCF record (variant)
49#[derive(Debug, Clone)]
50pub struct VcfRecord {
51    /// Chromosome
52    pub chrom: String,
53    /// Position (1-based)
54    pub pos: u64,
55    /// Variant ID (e.g., rs number)
56    pub id: String,
57    /// Reference allele
58    pub reference: String,
59    /// Alternate allele(s)
60    pub alt: Vec<String>,
61    /// Quality score
62    pub qual: Option<f64>,
63    /// Filter status
64    pub filter: String,
65    /// INFO field (key-value annotations)
66    pub info: String,
67    /// FORMAT field
68    pub format: Option<String>,
69    /// Sample genotypes
70    pub samples: Vec<String>,
71}
72
73impl VcfRecord {
74    /// Get the genomic position of this variant
75    pub fn position(&self) -> GenomicPosition {
76        GenomicPosition::new(self.chrom.clone(), self.pos)
77    }
78
79    /// Check if this variant passed all filters
80    pub fn is_pass(&self) -> bool {
81        self.filter == "PASS" || self.filter == "."
82    }
83
84    /// Check if this variant is a SNP (single nucleotide polymorphism)
85    ///
86    /// Returns true if both REF and all ALT alleles are single nucleotides.
87    pub fn is_snp(&self) -> bool {
88        // REF must be single nucleotide
89        if self.reference.len() != 1 {
90            return false;
91        }
92
93        // All ALT alleles must be single nucleotides
94        self.alt.iter().all(|alt| alt.len() == 1 && alt != ".")
95    }
96
97    /// Check if this variant is an indel (insertion/deletion)
98    ///
99    /// Returns true if REF or any ALT allele has length != 1.
100    pub fn is_indel(&self) -> bool {
101        // If REF is not single nucleotide, it's an indel
102        if self.reference.len() != 1 {
103            return true;
104        }
105
106        // If any ALT is not single nucleotide (excluding missing "."), it's an indel
107        self.alt.iter().any(|alt| alt.len() != 1 && alt != ".")
108    }
109
110    /// Check if this variant is a transition (A↔G or C↔T)
111    ///
112    /// Returns true if the variant is a SNP and all ALT alleles are transitions.
113    pub fn is_transition(&self) -> bool {
114        if !self.is_snp() {
115            return false;
116        }
117
118        let ref_base = self.reference.chars().next().unwrap();
119
120        self.alt.iter().all(|alt| {
121            if let Some(alt_base) = alt.chars().next() {
122                matches!(
123                    (ref_base, alt_base),
124                    ('A', 'G') | ('G', 'A') | ('C', 'T') | ('T', 'C')
125                )
126            } else {
127                false
128            }
129        })
130    }
131
132    /// Check if this variant is a transversion (A↔C, A↔T, G↔C, G↔T)
133    ///
134    /// Returns true if the variant is a SNP and all ALT alleles are transversions.
135    pub fn is_transversion(&self) -> bool {
136        if !self.is_snp() {
137            return false;
138        }
139
140        let ref_base = self.reference.chars().next().unwrap();
141
142        self.alt.iter().all(|alt| {
143            if let Some(alt_base) = alt.chars().next() {
144                matches!(
145                    (ref_base, alt_base),
146                    ('A', 'C') | ('A', 'T') | ('C', 'A') | ('C', 'G') | ('G', 'C') | ('G', 'T')
147                        | ('T', 'A') | ('T', 'G')
148                )
149            } else {
150                false
151            }
152        })
153    }
154}
155
156/// VCF header metadata
157#[derive(Debug, Clone, Default)]
158pub struct VcfHeader {
159    /// File format version
160    pub version: String,
161    /// Contig/chromosome definitions
162    pub contigs: Vec<String>,
163    /// Sample names
164    pub samples: Vec<String>,
165    /// INFO field definitions
166    pub info_fields: Vec<String>,
167    /// FORMAT field definitions
168    pub format_fields: Vec<String>,
169    /// Other header lines
170    pub other: Vec<String>,
171}
172
173/// VCF file reader (streaming, memory-efficient)
174///
175/// **The only way to create a VcfReader is `VcfReader::from_path()`.**
176///
177/// This is intentionally simple - there's one obvious way to read a VCF file.
178pub struct VcfReader {
179    reader: Box<dyn BufRead>,
180    header: VcfHeader,
181}
182
183/// Iterator that annotates VCF records with genomic context
184///
185/// Created by calling `VcfReader::annotate_with()`.
186pub struct AnnotatingIterator {
187    reader: VcfReader,
188    index: AnnotationIndex,
189}
190
191impl VcfReader {
192    /// Open a VCF file - handles .vcf and .vcf.gz automatically
193    ///
194    /// # Memory Behavior
195    ///
196    /// - Opens file handle (~8KB buffer)
197    /// - Parses header (~1-10KB typically)
198    /// - **Does NOT load variants into memory**
199    /// - Variants are parsed on-demand as you iterate
200    ///
201    /// # Example
202    ///
203    /// ```no_run
204    /// use genomicframe_core::formats::vcf::VcfReader;
205    /// use genomicframe_core::core::GenomicRecordIterator;
206    ///
207    /// // Works with multi-GB files using only ~10KB RAM
208    /// let mut reader = VcfReader::from_path("variants.vcf.gz")?;
209    ///
210    /// while let Some(variant) = reader.next_record()? {
211    ///     if variant.is_pass() {
212    ///         println!("{:?}", variant);
213    ///     }
214    /// }
215    /// # Ok::<(), genomicframe_core::error::Error>(())
216    /// ```
217    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
218        let path = path.as_ref();
219        let file = File::open(path)?;
220        let compression = Compression::from_path(path);
221
222        let reader: Box<dyn BufRead> = match compression {
223            Compression::Gzip | Compression::Bgzip => {
224                Box::new(BufReader::new(MultiGzDecoder::new(file)))
225            }
226            _ => Box::new(BufReader::new(file)),
227        };
228
229        let header = Self::parse_header(reader)?;
230        Ok(header)
231    }
232
233    /// Get the VCF header metadata
234    pub fn header(&self) -> &VcfHeader {
235        &self.header
236    }
237
238    /// Create an annotating iterator that enriches variants with genomic context
239    ///
240    /// This is the ergonomic way to annotate variants from BED files, GFF files, etc.
241    ///
242    /// # Example
243    ///
244    /// ```no_run
245    /// use genomicframe_core::formats::vcf::VcfReader;
246    /// use genomicframe_core::interval::annotation::AnnotationIndex;
247    ///
248    /// // Load gene annotations
249    /// let genes = AnnotationIndex::from_bed("genes.bed", |r| {
250    ///     r.name.clone().unwrap_or_default()
251    /// })?;
252    ///
253    /// // Annotate variants as they're read
254    /// let mut reader = VcfReader::from_path("variants.vcf.gz")?;
255    /// for annotated in reader.annotate_with(genes) {
256    ///     let annotated = annotated?;
257    ///     if annotated.is_annotated() {
258    ///         println!("{:?} overlaps {}", annotated.record.position(),
259    ///                  annotated.all_annotations().join(", "));
260    ///     }
261    /// }
262    /// # Ok::<(), genomicframe_core::error::Error>(())
263    /// ```
264    pub fn annotate_with(self, index: AnnotationIndex) -> AnnotatingIterator {
265        AnnotatingIterator {
266            reader: self,
267            index,
268        }
269    }
270
271    /// Internal: Parse VCF header and return reader positioned at first variant
272    fn parse_header(mut reader: Box<dyn BufRead>) -> Result<Self> {
273        let mut header = VcfHeader::default();
274        let mut line = String::new();
275
276        loop {
277            line.clear();
278            let bytes_read = reader.read_line(&mut line)?;
279            if bytes_read == 0 {
280                return Err(Error::Parse("Empty VCF file".to_string()));
281            }
282
283            if !line.starts_with('#') {
284                // Hit first variant line - done with header
285                break;
286            }
287
288            // Parse header metadata
289            if line.starts_with("##fileformat=") {
290                header.version = line.trim_start_matches("##fileformat=").trim().to_string();
291            } else if line.starts_with("##contig=") {
292                header.contigs.push(line.trim().to_string());
293            } else if line.starts_with("##INFO=") {
294                header.info_fields.push(line.trim().to_string());
295            } else if line.starts_with("##FORMAT=") {
296                header.format_fields.push(line.trim().to_string());
297            } else if line.starts_with("#CHROM") {
298                let parts: Vec<&str> = line.trim().split('\t').collect();
299                if parts.len() > 9 {
300                    header.samples = parts[9..].iter().map(|s| s.to_string()).collect();
301                }
302            } else {
303                header.other.push(line.trim().to_string());
304            }
305        }
306
307        Ok(Self { reader, header })
308    }
309
310    /// Parse a single VCF record line
311    fn parse_record(line: &str) -> Result<VcfRecord> {
312        let parts: Vec<&str> = line.split('\t').collect();
313        if parts.len() < 8 {
314            return Err(Error::Parse(format!(
315                "Invalid VCF record: expected at least 8 fields, got {}",
316                parts.len()
317            )));
318        }
319
320        let pos = parts[1]
321            .parse::<u64>()
322            .map_err(|e| Error::Parse(format!("Invalid position: {}", e)))?;
323
324        let qual = if parts[5] == "." {
325            None
326        } else {
327            Some(
328                parts[5]
329                    .parse::<f64>()
330                    .map_err(|e| Error::Parse(format!("Invalid quality score: {}", e)))?,
331            )
332        };
333
334        let alt = parts[4].split(',').map(|s| s.to_string()).collect();
335
336        Ok(VcfRecord {
337            chrom: parts[0].to_string(),
338            pos,
339            id: parts[2].to_string(),
340            reference: parts[3].to_string(),
341            alt,
342            qual,
343            filter: parts[6].to_string(),
344            info: parts[7].to_string(),
345            format: parts.get(8).map(|s| s.to_string()),
346            samples: parts[9..].iter().map(|s| s.to_string()).collect(),
347        })
348    }
349}
350
351impl GenomicRecordIterator for VcfReader {
352    type Record = VcfRecord;
353
354    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> { 
355        // TODO: Implement
356        Ok(None)
357    }
358
359    fn next_record(&mut self) -> Result<Option<Self::Record>> {
360        let mut line = String::new();
361        loop {
362            line.clear();
363            let bytes_read = self.reader.read_line(&mut line)?;
364            if bytes_read == 0 {
365                return Ok(None);
366            }
367
368            let line = line.trim();
369            if line.is_empty() || line.starts_with('#') {
370                continue;
371            }
372
373            return Ok(Some(Self::parse_record(line)?));
374        }
375    }
376}
377
378impl GenomicReader for VcfReader {
379    type Metadata = VcfHeader;
380
381    fn metadata(&self) -> &Self::Metadata {
382        &self.header
383    }
384}
385
386impl Iterator for AnnotatingIterator {
387    type Item = Result<AnnotatedVcfRecord>;
388
389    fn next(&mut self) -> Option<Self::Item> {
390        match self.reader.next_record() {
391            Ok(Some(record)) => {
392                let mut annotated = AnnotatedVcfRecord::new(record);
393                annotated.annotate_with(&self.index);
394                Some(Ok(annotated))
395            }
396            Ok(None) => None,
397            Err(e) => Some(Err(e)),
398        }
399    }
400}
401
402#[cfg(test)]
403mod tests {
404    use super::*;
405    use std::io::Write;
406    use tempfile::NamedTempFile;
407
408    #[test]
409    fn test_vcf_header_parsing() -> Result<()> {
410        let vcf_data = "##fileformat=VCFv4.2\n\
411                        ##contig=<ID=chr1,length=248956422>\n\
412                        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample1\n\
413                        chr1\t12345\t.\tA\tG\t30\tPASS\t.\tGT\t0/1\n";
414
415        // Write to temp file
416        let mut temp_file = NamedTempFile::new()?;
417        temp_file.write_all(vcf_data.as_bytes())?;
418        temp_file.flush()?;
419
420        let reader = VcfReader::from_path(temp_file.path())?;
421
422        assert_eq!(reader.header().version, "VCFv4.2");
423        assert_eq!(reader.header().samples.len(), 1);
424        assert_eq!(reader.header().samples[0], "sample1");
425
426        Ok(())
427    }
428
429    #[test]
430    fn test_vcf_record_parsing() {
431        let line = "chr1\t12345\trs123\tA\tG\t30.5\tPASS\tDP=100\tGT\t0/1";
432        let record = VcfReader::parse_record(line).unwrap();
433
434        assert_eq!(record.chrom, "chr1");
435        assert_eq!(record.pos, 12345);
436        assert_eq!(record.id, "rs123");
437        assert_eq!(record.reference, "A");
438        assert_eq!(record.alt, vec!["G"]);
439        assert_eq!(record.qual, Some(30.5));
440        assert!(record.is_pass());
441    }
442}