genomicframe_core/formats/vcf/
reader.rs1use crate::core::{GenomicPosition, GenomicReader, GenomicRecordIterator};
39use crate::error::{Error, Result};
40use crate::io::Compression;
41use crate::interval::annotation::AnnotationIndex;
42use crate::formats::vcf::annotated::AnnotatedVcfRecord;
43use flate2::read::MultiGzDecoder;
44use std::fs::File;
45use std::io::{BufRead, BufReader};
46use std::path::Path;
47
48#[derive(Debug, Clone)]
50pub struct VcfRecord {
51 pub chrom: String,
53 pub pos: u64,
55 pub id: String,
57 pub reference: String,
59 pub alt: Vec<String>,
61 pub qual: Option<f64>,
63 pub filter: String,
65 pub info: String,
67 pub format: Option<String>,
69 pub samples: Vec<String>,
71}
72
73impl VcfRecord {
74 pub fn position(&self) -> GenomicPosition {
76 GenomicPosition::new(self.chrom.clone(), self.pos)
77 }
78
79 pub fn is_pass(&self) -> bool {
81 self.filter == "PASS" || self.filter == "."
82 }
83
84 pub fn is_snp(&self) -> bool {
88 if self.reference.len() != 1 {
90 return false;
91 }
92
93 self.alt.iter().all(|alt| alt.len() == 1 && alt != ".")
95 }
96
97 pub fn is_indel(&self) -> bool {
101 if self.reference.len() != 1 {
103 return true;
104 }
105
106 self.alt.iter().any(|alt| alt.len() != 1 && alt != ".")
108 }
109
110 pub fn is_transition(&self) -> bool {
114 if !self.is_snp() {
115 return false;
116 }
117
118 let ref_base = self.reference.chars().next().unwrap();
119
120 self.alt.iter().all(|alt| {
121 if let Some(alt_base) = alt.chars().next() {
122 matches!(
123 (ref_base, alt_base),
124 ('A', 'G') | ('G', 'A') | ('C', 'T') | ('T', 'C')
125 )
126 } else {
127 false
128 }
129 })
130 }
131
132 pub fn is_transversion(&self) -> bool {
136 if !self.is_snp() {
137 return false;
138 }
139
140 let ref_base = self.reference.chars().next().unwrap();
141
142 self.alt.iter().all(|alt| {
143 if let Some(alt_base) = alt.chars().next() {
144 matches!(
145 (ref_base, alt_base),
146 ('A', 'C') | ('A', 'T') | ('C', 'A') | ('C', 'G') | ('G', 'C') | ('G', 'T')
147 | ('T', 'A') | ('T', 'G')
148 )
149 } else {
150 false
151 }
152 })
153 }
154}
155
156#[derive(Debug, Clone, Default)]
158pub struct VcfHeader {
159 pub version: String,
161 pub contigs: Vec<String>,
163 pub samples: Vec<String>,
165 pub info_fields: Vec<String>,
167 pub format_fields: Vec<String>,
169 pub other: Vec<String>,
171}
172
173pub struct VcfReader {
179 reader: Box<dyn BufRead>,
180 header: VcfHeader,
181}
182
183pub struct AnnotatingIterator {
187 reader: VcfReader,
188 index: AnnotationIndex,
189}
190
191impl VcfReader {
192 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
218 let path = path.as_ref();
219 let file = File::open(path)?;
220 let compression = Compression::from_path(path);
221
222 let reader: Box<dyn BufRead> = match compression {
223 Compression::Gzip | Compression::Bgzip => {
224 Box::new(BufReader::new(MultiGzDecoder::new(file)))
225 }
226 _ => Box::new(BufReader::new(file)),
227 };
228
229 let header = Self::parse_header(reader)?;
230 Ok(header)
231 }
232
233 pub fn header(&self) -> &VcfHeader {
235 &self.header
236 }
237
238 pub fn annotate_with(self, index: AnnotationIndex) -> AnnotatingIterator {
265 AnnotatingIterator {
266 reader: self,
267 index,
268 }
269 }
270
271 fn parse_header(mut reader: Box<dyn BufRead>) -> Result<Self> {
273 let mut header = VcfHeader::default();
274 let mut line = String::new();
275
276 loop {
277 line.clear();
278 let bytes_read = reader.read_line(&mut line)?;
279 if bytes_read == 0 {
280 return Err(Error::Parse("Empty VCF file".to_string()));
281 }
282
283 if !line.starts_with('#') {
284 break;
286 }
287
288 if line.starts_with("##fileformat=") {
290 header.version = line.trim_start_matches("##fileformat=").trim().to_string();
291 } else if line.starts_with("##contig=") {
292 header.contigs.push(line.trim().to_string());
293 } else if line.starts_with("##INFO=") {
294 header.info_fields.push(line.trim().to_string());
295 } else if line.starts_with("##FORMAT=") {
296 header.format_fields.push(line.trim().to_string());
297 } else if line.starts_with("#CHROM") {
298 let parts: Vec<&str> = line.trim().split('\t').collect();
299 if parts.len() > 9 {
300 header.samples = parts[9..].iter().map(|s| s.to_string()).collect();
301 }
302 } else {
303 header.other.push(line.trim().to_string());
304 }
305 }
306
307 Ok(Self { reader, header })
308 }
309
310 fn parse_record(line: &str) -> Result<VcfRecord> {
312 let parts: Vec<&str> = line.split('\t').collect();
313 if parts.len() < 8 {
314 return Err(Error::Parse(format!(
315 "Invalid VCF record: expected at least 8 fields, got {}",
316 parts.len()
317 )));
318 }
319
320 let pos = parts[1]
321 .parse::<u64>()
322 .map_err(|e| Error::Parse(format!("Invalid position: {}", e)))?;
323
324 let qual = if parts[5] == "." {
325 None
326 } else {
327 Some(
328 parts[5]
329 .parse::<f64>()
330 .map_err(|e| Error::Parse(format!("Invalid quality score: {}", e)))?,
331 )
332 };
333
334 let alt = parts[4].split(',').map(|s| s.to_string()).collect();
335
336 Ok(VcfRecord {
337 chrom: parts[0].to_string(),
338 pos,
339 id: parts[2].to_string(),
340 reference: parts[3].to_string(),
341 alt,
342 qual,
343 filter: parts[6].to_string(),
344 info: parts[7].to_string(),
345 format: parts.get(8).map(|s| s.to_string()),
346 samples: parts[9..].iter().map(|s| s.to_string()).collect(),
347 })
348 }
349}
350
351impl GenomicRecordIterator for VcfReader {
352 type Record = VcfRecord;
353
354 fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
355 Ok(None)
357 }
358
359 fn next_record(&mut self) -> Result<Option<Self::Record>> {
360 let mut line = String::new();
361 loop {
362 line.clear();
363 let bytes_read = self.reader.read_line(&mut line)?;
364 if bytes_read == 0 {
365 return Ok(None);
366 }
367
368 let line = line.trim();
369 if line.is_empty() || line.starts_with('#') {
370 continue;
371 }
372
373 return Ok(Some(Self::parse_record(line)?));
374 }
375 }
376}
377
378impl GenomicReader for VcfReader {
379 type Metadata = VcfHeader;
380
381 fn metadata(&self) -> &Self::Metadata {
382 &self.header
383 }
384}
385
386impl Iterator for AnnotatingIterator {
387 type Item = Result<AnnotatedVcfRecord>;
388
389 fn next(&mut self) -> Option<Self::Item> {
390 match self.reader.next_record() {
391 Ok(Some(record)) => {
392 let mut annotated = AnnotatedVcfRecord::new(record);
393 annotated.annotate_with(&self.index);
394 Some(Ok(annotated))
395 }
396 Ok(None) => None,
397 Err(e) => Some(Err(e)),
398 }
399 }
400}
401
402#[cfg(test)]
403mod tests {
404 use super::*;
405 use std::io::Write;
406 use tempfile::NamedTempFile;
407
408 #[test]
409 fn test_vcf_header_parsing() -> Result<()> {
410 let vcf_data = "##fileformat=VCFv4.2\n\
411 ##contig=<ID=chr1,length=248956422>\n\
412 #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample1\n\
413 chr1\t12345\t.\tA\tG\t30\tPASS\t.\tGT\t0/1\n";
414
415 let mut temp_file = NamedTempFile::new()?;
417 temp_file.write_all(vcf_data.as_bytes())?;
418 temp_file.flush()?;
419
420 let reader = VcfReader::from_path(temp_file.path())?;
421
422 assert_eq!(reader.header().version, "VCFv4.2");
423 assert_eq!(reader.header().samples.len(), 1);
424 assert_eq!(reader.header().samples[0], "sample1");
425
426 Ok(())
427 }
428
429 #[test]
430 fn test_vcf_record_parsing() {
431 let line = "chr1\t12345\trs123\tA\tG\t30.5\tPASS\tDP=100\tGT\t0/1";
432 let record = VcfReader::parse_record(line).unwrap();
433
434 assert_eq!(record.chrom, "chr1");
435 assert_eq!(record.pos, 12345);
436 assert_eq!(record.id, "rs123");
437 assert_eq!(record.reference, "A");
438 assert_eq!(record.alt, vec!["G"]);
439 assert_eq!(record.qual, Some(30.5));
440 assert!(record.is_pass());
441 }
442}