use crate::core::{GenomicPosition, GenomicReader, GenomicRecordIterator};
use crate::error::{Error, Result};
use crate::io::Compression;
use crate::interval::annotation::AnnotationIndex;
use crate::formats::vcf::annotated::AnnotatedVcfRecord;
use flate2::read::MultiGzDecoder;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
#[derive(Debug, Clone)]
pub struct VcfRecord {
pub chrom: String,
pub pos: u64,
pub id: String,
pub reference: String,
pub alt: Vec<String>,
pub qual: Option<f64>,
pub filter: String,
pub info: String,
pub format: Option<String>,
pub samples: Vec<String>,
}
impl VcfRecord {
pub fn position(&self) -> GenomicPosition {
GenomicPosition::new(self.chrom.clone(), self.pos)
}
pub fn is_pass(&self) -> bool {
self.filter == "PASS" || self.filter == "."
}
pub fn is_snp(&self) -> bool {
if self.reference.len() != 1 {
return false;
}
self.alt.iter().all(|alt| alt.len() == 1 && alt != ".")
}
pub fn is_indel(&self) -> bool {
if self.reference.len() != 1 {
return true;
}
self.alt.iter().any(|alt| alt.len() != 1 && alt != ".")
}
pub fn is_transition(&self) -> bool {
if !self.is_snp() {
return false;
}
let ref_base = self.reference.chars().next().unwrap();
self.alt.iter().all(|alt| {
if let Some(alt_base) = alt.chars().next() {
matches!(
(ref_base, alt_base),
('A', 'G') | ('G', 'A') | ('C', 'T') | ('T', 'C')
)
} else {
false
}
})
}
pub fn is_transversion(&self) -> bool {
if !self.is_snp() {
return false;
}
let ref_base = self.reference.chars().next().unwrap();
self.alt.iter().all(|alt| {
if let Some(alt_base) = alt.chars().next() {
matches!(
(ref_base, alt_base),
('A', 'C') | ('A', 'T') | ('C', 'A') | ('C', 'G') | ('G', 'C') | ('G', 'T')
| ('T', 'A') | ('T', 'G')
)
} else {
false
}
})
}
}
#[derive(Debug, Clone, Default)]
pub struct VcfHeader {
pub version: String,
pub contigs: Vec<String>,
pub samples: Vec<String>,
pub info_fields: Vec<String>,
pub format_fields: Vec<String>,
pub other: Vec<String>,
}
pub struct VcfReader {
reader: Box<dyn BufRead>,
header: VcfHeader,
}
pub struct AnnotatingIterator {
reader: VcfReader,
index: AnnotationIndex,
}
impl VcfReader {
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file = File::open(path)?;
let compression = Compression::from_path(path);
let reader: Box<dyn BufRead> = match compression {
Compression::Gzip | Compression::Bgzip => {
Box::new(BufReader::new(MultiGzDecoder::new(file)))
}
_ => Box::new(BufReader::new(file)),
};
let header = Self::parse_header(reader)?;
Ok(header)
}
pub fn header(&self) -> &VcfHeader {
&self.header
}
pub fn annotate_with(self, index: AnnotationIndex) -> AnnotatingIterator {
AnnotatingIterator {
reader: self,
index,
}
}
fn parse_header(mut reader: Box<dyn BufRead>) -> Result<Self> {
let mut header = VcfHeader::default();
let mut line = String::new();
loop {
line.clear();
let bytes_read = reader.read_line(&mut line)?;
if bytes_read == 0 {
return Err(Error::Parse("Empty VCF file".to_string()));
}
if !line.starts_with('#') {
break;
}
if line.starts_with("##fileformat=") {
header.version = line.trim_start_matches("##fileformat=").trim().to_string();
} else if line.starts_with("##contig=") {
header.contigs.push(line.trim().to_string());
} else if line.starts_with("##INFO=") {
header.info_fields.push(line.trim().to_string());
} else if line.starts_with("##FORMAT=") {
header.format_fields.push(line.trim().to_string());
} else if line.starts_with("#CHROM") {
let parts: Vec<&str> = line.trim().split('\t').collect();
if parts.len() > 9 {
header.samples = parts[9..].iter().map(|s| s.to_string()).collect();
}
} else {
header.other.push(line.trim().to_string());
}
}
Ok(Self { reader, header })
}
fn parse_record(line: &str) -> Result<VcfRecord> {
let parts: Vec<&str> = line.split('\t').collect();
if parts.len() < 8 {
return Err(Error::Parse(format!(
"Invalid VCF record: expected at least 8 fields, got {}",
parts.len()
)));
}
let pos = parts[1]
.parse::<u64>()
.map_err(|e| Error::Parse(format!("Invalid position: {}", e)))?;
let qual = if parts[5] == "." {
None
} else {
Some(
parts[5]
.parse::<f64>()
.map_err(|e| Error::Parse(format!("Invalid quality score: {}", e)))?,
)
};
let alt = parts[4].split(',').map(|s| s.to_string()).collect();
Ok(VcfRecord {
chrom: parts[0].to_string(),
pos,
id: parts[2].to_string(),
reference: parts[3].to_string(),
alt,
qual,
filter: parts[6].to_string(),
info: parts[7].to_string(),
format: parts.get(8).map(|s| s.to_string()),
samples: parts[9..].iter().map(|s| s.to_string()).collect(),
})
}
}
impl GenomicRecordIterator for VcfReader {
type Record = VcfRecord;
fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
Ok(None)
}
fn next_record(&mut self) -> Result<Option<Self::Record>> {
let mut line = String::new();
loop {
line.clear();
let bytes_read = self.reader.read_line(&mut line)?;
if bytes_read == 0 {
return Ok(None);
}
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
return Ok(Some(Self::parse_record(line)?));
}
}
}
impl GenomicReader for VcfReader {
type Metadata = VcfHeader;
fn metadata(&self) -> &Self::Metadata {
&self.header
}
}
impl Iterator for AnnotatingIterator {
type Item = Result<AnnotatedVcfRecord>;
fn next(&mut self) -> Option<Self::Item> {
match self.reader.next_record() {
Ok(Some(record)) => {
let mut annotated = AnnotatedVcfRecord::new(record);
annotated.annotate_with(&self.index);
Some(Ok(annotated))
}
Ok(None) => None,
Err(e) => Some(Err(e)),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
fn test_vcf_header_parsing() -> Result<()> {
let vcf_data = "##fileformat=VCFv4.2\n\
##contig=<ID=chr1,length=248956422>\n\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample1\n\
chr1\t12345\t.\tA\tG\t30\tPASS\t.\tGT\t0/1\n";
let mut temp_file = NamedTempFile::new()?;
temp_file.write_all(vcf_data.as_bytes())?;
temp_file.flush()?;
let reader = VcfReader::from_path(temp_file.path())?;
assert_eq!(reader.header().version, "VCFv4.2");
assert_eq!(reader.header().samples.len(), 1);
assert_eq!(reader.header().samples[0], "sample1");
Ok(())
}
#[test]
fn test_vcf_record_parsing() {
let line = "chr1\t12345\trs123\tA\tG\t30.5\tPASS\tDP=100\tGT\t0/1";
let record = VcfReader::parse_record(line).unwrap();
assert_eq!(record.chrom, "chr1");
assert_eq!(record.pos, 12345);
assert_eq!(record.id, "rs123");
assert_eq!(record.reference, "A");
assert_eq!(record.alt, vec!["G"]);
assert_eq!(record.qual, Some(30.5));
assert!(record.is_pass());
}
}