grumpy/
vcf.rs

1//! Module for handling VCF files
2use pyo3::prelude::*;
3use rayon::prelude::*;
4
5use std::collections::{hash_map, HashMap};
6use std::fs::File;
7use std::io::BufReader;
8use std::string::String;
9use vcf::{VCFReader, VCFRecord};
10
11use crate::common::{AltType, Evidence, VCFRow};
12
13const NULL_FILTERS: [&str; 7] = [
14    "MIN_QUAL",
15    "MIN_MQ",
16    "MIN_VDB",
17    "MIN_HQ_DP",
18    "STRAND_BIAS",
19    "STRAND_MISMATCH",
20    "SNP_SUPPORT_IN_VCF",
21];
22
23#[pyclass]
24/// Dummy struct for wrapping VCFRecord
25///
26/// Required to make a valid pyclass to use as a function argument
27#[derive(Clone)]
28pub struct VCFRecordToParse {
29    pub record: VCFRecord,
30    pub min_dp: i32,
31    pub required_fields: Vec<String>,
32    pub vcf_row_index: usize,
33}
34
35#[pyclass]
36#[derive(Clone, Debug)]
37/// Struct to hold the information from a VCF file
38pub struct VCFFile {
39    #[pyo3(get, set)]
40    /// Header of the VCF file. TODO: populate
41    pub header: Vec<String>,
42
43    #[pyo3(get, set)]
44    /// Records of the VCF file
45    pub records: Vec<VCFRow>,
46
47    #[pyo3(get, set)]
48    /// Calls from the VCF file, indexed by genome index
49    pub calls: HashMap<i64, Vec<Evidence>>,
50
51    #[pyo3(get, set)]
52    /// Minor calls from the VCF file, indexed by genome index
53    pub minor_calls: HashMap<i64, Vec<Evidence>>,
54}
55
56#[pymethods]
57impl VCFFile {
58    #[new]
59    /// Create a new VCFFile object
60    ///
61    /// # Arguments
62    /// - `filename`: String - Path to the VCF file
63    /// - `ignore_filter`: bool - Whether to ignore the filter column
64    /// - `min_dp`: i32 - Minimum depth to consider a call
65    pub fn new(filename: String, ignore_filter: bool, min_dp: i32) -> Self {
66        let file = File::open(filename).unwrap();
67        let buf = BufReader::new(file);
68        let mut reader = VCFReader::new(buf).unwrap();
69
70        let mut record = reader.empty_record();
71        let mut more_records = reader.next_record(&mut record);
72        let mut reader_records = Vec::new();
73
74        let required_fields = vec!["GT".to_string()];
75        // Read data into memory
76        while matches!(more_records, Ok(true)) {
77            reader_records.push(record.clone());
78            more_records = reader.next_record(&mut record);
79        }
80
81        // Parse records multithreaded
82        let parsed = reader_records
83            .par_iter()
84            .enumerate()
85            .map(|(idx, record)| {
86                VCFFile::parse_record(VCFRecordToParse {
87                    record: record.clone(),
88                    min_dp,
89                    required_fields: required_fields.clone(),
90                    vcf_row_index: idx,
91                })
92            })
93            .collect::<Vec<(VCFRow, Vec<Evidence>, Vec<Evidence>)>>();
94        // For ease of access, we'll store the calls in a hashmap indexed on genome index
95        let mut calls_map: HashMap<i64, Vec<Evidence>> = HashMap::new();
96        let mut minor_calls_map: HashMap<i64, Vec<Evidence>> = HashMap::new();
97        let mut records = Vec::new();
98
99        // Fetch data
100        for (record, record_calls, _record_minor_calls) in parsed.iter() {
101            let passed = record.is_filter_pass;
102            let mut record_minor_calls = _record_minor_calls.clone();
103            records.push(record.clone());
104            for call in record_calls.iter() {
105                let mut added = false;
106                if call.call_type == AltType::NULL || ignore_filter || passed {
107                    added = true;
108                }
109
110                // Check if calls are already in the map,
111                // if they are, warn the user about multiple calls at the same position and add to vector
112                // if they aren't, add to the map
113                if let hash_map::Entry::Vacant(e) = calls_map.entry(call.genome_index) {
114                    if added {
115                        e.insert(vec![call.clone()]);
116                    }
117                } else {
118                    println!(
119                        "Multiple calls at genome position {}! {:?}\n",
120                        call.genome_index,
121                        calls_map.get(&call.genome_index).unwrap()
122                    );
123                    if added {
124                        calls_map
125                            .get_mut(&call.genome_index)
126                            .unwrap()
127                            .push(call.clone());
128                    }
129                }
130
131                if !added {
132                    // Call skipped due to filter fail, so add as a minor call
133                    let mut c = call.clone();
134                    c.is_minor = true;
135                    record_minor_calls.push(c);
136                }
137            }
138
139            // Add minor calls if the filter is passed or ignored, or if fails lie within allowed filters
140            let mut valid_filters = passed || ignore_filter;
141            if !valid_filters {
142                // Not passed filter and not ignored, so check if all filters are in allowed filters
143                let allowed_filters = [
144                    "MIN_FRS".to_string(),
145                    "MIN_DP".to_string(),
146                    "MIN_GCP".to_string(),
147                    // ONT specific
148                    "MIN_IDV".to_string(),
149                    "MIN_IMF".to_string(),
150                    "OVERLAP_BETTER_VARIANT".to_string(),
151                ];
152                // Auto-ignore if no filters are present (i.e filter = "." which is fail)
153                let mut all_allowed = !record.filter.is_empty();
154                for f in record.filter.iter() {
155                    if !allowed_filters.contains(f) {
156                        all_allowed = false;
157                        break;
158                    }
159                }
160                if all_allowed {
161                    valid_filters = true;
162                }
163            }
164            if valid_filters {
165                for call in record_minor_calls.iter() {
166                    if let hash_map::Entry::Vacant(e) = minor_calls_map.entry(call.genome_index) {
167                        e.insert(vec![call.clone()]);
168                    } else {
169                        minor_calls_map
170                            .get_mut(&call.genome_index)
171                            .unwrap()
172                            .push(call.clone());
173                    }
174                }
175            }
176        }
177
178        // I hate implict returns, but appease clippy
179        VCFFile {
180            header: Vec::new(),
181            records,
182            calls: calls_map,
183            minor_calls: minor_calls_map,
184        }
185    }
186
187    #[staticmethod]
188    fn parse_record(rec: VCFRecordToParse) -> (VCFRow, Vec<Evidence>, Vec<Evidence>) {
189        let record = rec.record;
190        let min_dp = rec.min_dp;
191        let required_fields = rec.required_fields;
192        let vcf_row_index = rec.vcf_row_index;
193
194        // Parse fields of the record to pull out data
195        // For whatever reason non of these fields are strings, so we need to convert them
196        let mut alts = Vec::new(); // One string per alt
197        for alt in record.alternative.iter() {
198            alts.push(String::from_utf8_lossy(alt).to_string().to_lowercase());
199        }
200        let mut filters = Vec::new(); // String per filter
201        for filter in record.filter.iter() {
202            for f in String::from_utf8_lossy(filter).split(";") {
203                filters.push(f.to_string());
204            }
205        }
206
207        // Oddities of how this VCF library works...
208        // Format is a vector of bytes, but we need to convert it to a vector of strings
209        // Each of these strings corresponds to an item in the genotypes vector
210        let mut format: Vec<String> = Vec::new();
211        for f in record.format.iter() {
212            format.push(String::from_utf8_lossy(f).to_string());
213        }
214
215        let mut idx = 0;
216        let mut fields: HashMap<String, Vec<String>> = HashMap::new();
217        for s in record.genotype.iter() {
218            let mut item: Vec<Vec<String>> = Vec::new();
219            for i in s.iter() {
220                let mut value: Vec<String> = Vec::new();
221                for j in i.iter() {
222                    value.push(String::from_utf8_lossy(j).to_string());
223                }
224                item.push(value.clone());
225                fields.insert(format[idx].clone(), value.clone());
226                idx += 1;
227            }
228        }
229
230        // Validate that this record has the required fields
231        for field in required_fields.iter() {
232            if !format.contains(&field.to_string()) {
233                panic!("Required field {} not found in record", field);
234            }
235        }
236
237        // Enforce that this record has a COV field
238        if !fields.contains_key("COV") {
239            let mut _cov_tag = "";
240            let mut cov = Vec::new();
241            if fields.contains_key("AD") {
242                _cov_tag = "AD";
243            } else if fields.contains_key("RO") && fields.contains_key("AO") {
244                // Annoying edge case where no single COV field exists but can be constructed
245                _cov_tag = "RO";
246            } else {
247                panic!("No COV tag found in record");
248            }
249            if _cov_tag != "RO" {
250                for c in fields.get(_cov_tag).unwrap() {
251                    cov.push(c.to_string());
252                }
253            } else {
254                let ro = fields.get("RO").unwrap()[0].clone();
255                let ao = fields.get("AO").unwrap()[0].clone();
256                cov.push(ro.to_string());
257                cov.push(ao.to_string());
258            }
259
260            fields.insert("COV".to_string(), cov);
261        }
262
263        // Check if this record has passed the filters
264        let mut passed = false;
265        if filters == vec!["PASS"] {
266            passed = true;
267        }
268
269        let is_complex = record.reference.len() > 1000 && alts.len() > 1;
270
271        let row = VCFRow {
272            position: record.position as i64,
273            reference: String::from_utf8_lossy(&record.reference)
274                .to_string()
275                .to_lowercase(),
276            alternative: alts.clone(),
277            filter: filters.clone(),
278            fields: fields.clone(),
279            is_filter_pass: passed,
280            is_complex,
281        };
282
283        let (record_calls, record_minor_calls) =
284            VCFFile::parse_record_for_calls(row.clone(), min_dp, vcf_row_index);
285
286        // if record.position == 178452 {
287        //     println!("{:?}\t{:?}\t{:?}\t{:?}\t{:?}", record.position, String::from_utf8_lossy(&record.reference), alts, filters, fields);
288        //     for call in record_calls.iter(){
289        //         println!("{:?}\n", call);
290        //     }
291        //     println!("--");
292        //     for call in record_minor_calls.iter(){
293        //         println!("{:?}\n", call);
294        //     }
295        //     println!("\n\n");
296        // }
297
298        (row, record_calls, record_minor_calls)
299    }
300
301    #[staticmethod]
302    /// Parse a record from a VCF file to get the calls
303    ///
304    /// # Arguments
305    /// - `record`: VCFRow - Record to parse
306    /// - `min_dp`: i32 - Minimum depth to consider a call
307    ///
308    /// # Returns
309    /// Tuple of:
310    /// - `calls`: Vec of Evidence - Calls from the record
311    /// - `minor_calls`: Vec of Evidence - Minor calls from the record
312    pub fn parse_record_for_calls(
313        record: VCFRow,
314        min_dp: i32,
315        vcf_row_index: usize,
316    ) -> (Vec<Evidence>, Vec<Evidence>) {
317        let mut calls: Vec<Evidence> = Vec::new();
318        let mut minor_calls: Vec<Evidence> = Vec::new();
319
320        // Dealing with the actual call here; should spit out possible minor calls afterwards
321
322        // Convert ugly genotype into a list of strings for each allele
323        let genotype = &(record.fields.get("GT").unwrap())[0]
324            .split("/")
325            .collect::<Vec<&str>>();
326        let mut cov = Vec::new();
327        for c in record.fields.get("COV").unwrap() {
328            cov.push(c.parse::<i32>().unwrap());
329        }
330
331        let mut dp = 0;
332        // As DP isn't always reliable, default to the sum of the COV field
333        cov.iter().for_each(|x| dp += x);
334
335        let ref_allele = record.reference.clone().to_lowercase();
336        let mut alt_allele = "".to_string();
337
338        let mut call_type = AltType::NULL;
339
340        // Check the filters to see if we need to override the parsing to a null call
341        for filter in record.filter.iter() {
342            // Doesn't matter what else is happening in this record if the filter
343            // is a filter we want to null
344            if NULL_FILTERS.contains(&filter.as_str()) {
345                calls.push(Evidence {
346                    cov: Some(0),
347                    frs: Some(ordered_float::OrderedFloat(0.0)),
348                    genotype: genotype.join("/"),
349                    call_type: AltType::NULL,
350                    vcf_row: vcf_row_index,
351                    reference: ref_allele.chars().nth(0).unwrap().to_string(),
352                    alt: "x".to_string(),
353                    genome_index: record.position,
354                    is_minor: false,
355                    vcf_idx: None,
356                });
357                return (calls, minor_calls);
358            }
359        }
360
361        // println!(
362        //     "{:?}\t{:?}\t{:?}\t{:?}\t{:?}",
363        //     record.position, record.reference, record.alternative, record.filter, record.fields
364        // );
365        let first = genotype[0];
366        // Adjust for 1-indexed VCF
367        let mut alt_idx = 0;
368        if first != "." {
369            alt_idx = first.parse::<i32>().unwrap() - 1;
370        }
371        if cov.len() == 1 {
372            // Just 1 item in the call so santiy check if it's a null call
373            let mut vcf_idx = None;
374            if genotype == &vec!["0", "0"] && cov[0] >= min_dp {
375                // Ref call
376                call_type = AltType::REF;
377                alt_allele = ref_allele.clone();
378                vcf_idx = Some(0);
379            } else if genotype[0] != genotype[1] && cov[0] >= min_dp {
380                // Het call
381                call_type = AltType::HET;
382                alt_allele = "z".to_string();
383            } else {
384                // Null call
385                call_type = AltType::NULL;
386                alt_allele = "x".to_string();
387            }
388            calls.push(Evidence {
389                cov: Some(cov[0_usize]),
390                frs: Some(ordered_float::OrderedFloat(1.0)),
391                genotype: genotype.join("/"),
392                call_type,
393                vcf_row: vcf_row_index,
394                reference: ref_allele.chars().nth(0).unwrap().to_string(),
395                alt: alt_allele.clone(),
396                genome_index: record.position,
397                is_minor: false,
398                vcf_idx,
399            });
400            return (calls, minor_calls);
401        }
402
403        for gt in genotype.iter().skip(1) {
404            if *gt != first {
405                call_type = AltType::HET;
406                alt_allele = "z".to_string();
407                break;
408            }
409        }
410        if call_type != AltType::HET {
411            if first == "0" {
412                call_type = AltType::REF;
413                alt_allele = ref_allele.clone();
414            } else {
415                // Placeholder to denote we have an actual call at this point
416                // Could actually be an indel instead but that can't be inferred from just the genotype
417                call_type = AltType::SNP;
418            }
419        }
420        if *genotype == vec![".", "."] {
421            call_type = AltType::NULL;
422        }
423        if call_type == AltType::NULL {
424            alt_allele = "x".to_string();
425        }
426
427        if call_type == AltType::SNP {
428            // Parse the corresponding alternate allele to get the call(s)
429            alt_allele = record.alternative[alt_idx as usize].clone().to_lowercase();
430            let call = VCFFile::simplify_call(ref_allele.clone(), alt_allele.clone());
431            let call_cov = cov[(alt_idx + 1) as usize]; // COV should be [ref, alt1, alt2..]
432            for (offset, __alt_type, __base) in call {
433                let mut _alt_type = __alt_type;
434                let mut _base = __base;
435                if call_cov < min_dp {
436                    // Override calls with null if the coverage is too low
437                    _alt_type = AltType::NULL;
438                    _base = "x".to_string();
439                }
440                let mut reference = "".to_string();
441                if offset >= 0 {
442                    reference = ref_allele.chars().nth(offset as usize).unwrap().to_string();
443                }
444                calls.push(Evidence {
445                    cov: Some(call_cov),
446                    frs: Some(ordered_float::OrderedFloat(call_cov as f32 / dp as f32)),
447                    genotype: genotype.join("/"),
448                    call_type: _alt_type,
449                    vcf_row: vcf_row_index,
450                    reference,
451                    alt: _base,
452                    genome_index: record.position + offset,
453                    is_minor: false,
454                    vcf_idx: Some((alt_idx + 1) as i64),
455                });
456            }
457        } else {
458            let mut call_cov = None;
459            let mut frs = None;
460            let mut vcf_idx = None;
461            if call_type != AltType::HET {
462                if genotype == &vec![".", "."] {
463                    // Explicit null GT can't point to a coverage so null
464                    call_type = AltType::NULL;
465                    alt_allele = "x".to_string();
466                } else {
467                    call_cov = Some(cov[(alt_idx + 1) as usize]); // COV should be [ref, alt1, alt2..]
468                    if call_cov.unwrap() < min_dp {
469                        // Override calls with null if the coverage is too low
470                        call_type = AltType::NULL;
471                        alt_allele = "x".to_string();
472                    }
473                    frs = Some(ordered_float::OrderedFloat(
474                        call_cov.unwrap() as f32 / dp as f32,
475                    ));
476                    vcf_idx = Some((alt_idx + 1) as i64);
477                }
478            }
479            for (offset, r) in ref_allele.chars().enumerate() {
480                calls.push(Evidence {
481                    cov: call_cov,
482                    frs,
483                    genotype: genotype.join("/"),
484                    call_type: call_type.clone(),
485                    vcf_row: vcf_row_index,
486                    reference: r.to_string(),
487                    alt: alt_allele.clone(),
488                    genome_index: record.position + offset as i64,
489                    is_minor: false,
490                    vcf_idx,
491                });
492            }
493        }
494
495        if call_type == AltType::NULL {
496            // Don't check for minor calls if the call is null
497            return (calls, minor_calls);
498        }
499
500        if call_type == AltType::HET {
501            // HET calls need to make sure we aren't skipping the first alt
502            alt_idx = -1;
503        }
504
505        // Now we've got the main call, we need to figure out the minor calls
506        // So check all possible values of COV for threshold to be considered a minor call
507        let mut idx = 0;
508        for coverage in cov.iter() {
509            if idx == (alt_idx + 1) as usize || idx == 0 {
510                // Skip ref and the alt we've already called
511                idx += 1;
512                continue;
513            }
514            if *coverage >= min_dp {
515                alt_allele = record.alternative[idx - 1].clone().to_lowercase();
516                let call = VCFFile::simplify_call(ref_allele.clone(), alt_allele.clone());
517                let call_cov = *coverage;
518                for (offset, alt_type, base) in call {
519                    let mut reference = "".to_string();
520                    if offset >= 0 {
521                        reference = ref_allele.chars().nth(offset as usize).unwrap().to_string();
522                    }
523                    minor_calls.push(Evidence {
524                        cov: Some(call_cov),
525                        frs: Some(ordered_float::OrderedFloat(call_cov as f32 / dp as f32)),
526                        genotype: genotype.join("/"), // This is a minor call so the row's genotype is the same
527                        call_type: alt_type,
528                        vcf_row: vcf_row_index,
529                        reference,
530                        alt: base,
531                        genome_index: record.position + offset,
532                        is_minor: true,
533                        vcf_idx: Some(idx as i64),
534                    });
535                }
536            }
537            idx += 1;
538        }
539
540        // I hate implict returns, but appease clippy
541        (calls, minor_calls)
542    }
543
544    #[staticmethod]
545    /// Simplify a call into a list of SNPs, INSs and DELs
546    ///
547    /// Some rows have long ref/alt pairs which need to be decomposed into constituent calls
548    ///
549    /// # Arguments
550    /// - `reference`: String - Reference sequence
551    /// - `alternate`: String - Alternate sequence
552    ///
553    /// # Returns
554    /// `Vec of (usize, AltType, String)` - Vec where each element is a 3-tuple of:
555    ///    - `usize`: Offset of the call from row's genome index
556    ///    - `AltType`: Type of call
557    ///    - `String`: Base(s) of the call. If SNP/HET/NULL this will be a single base. If INS/DEL this will be the sequence inserted/deleted
558    pub fn simplify_call(reference: String, alternate: String) -> Vec<(i64, AltType, String)> {
559        // Note that the offset of a call can be negative in annoying edge cases
560        // e.g insertion of "gg" --> "cagg" is a -1 offset as "ca" is inserted before the reference
561        let mut calls: Vec<(i64, AltType, String)> = Vec::new();
562        if reference.len() == alternate.len() {
563            // Simple set of SNPs
564            for i in 0..reference.len() {
565                if reference.chars().nth(i).unwrap() != alternate.chars().nth(i).unwrap() {
566                    calls.push((
567                        i as i64,
568                        AltType::SNP,
569                        alternate.chars().nth(i).unwrap().to_string(),
570                    ));
571                }
572            }
573            return calls;
574        }
575
576        /*
577        The process for finding the positions for indels are almost identical
578        as the process for finding a del can be interpreted as finding an ins with ref
579            and alt reversed.
580        The approach here is to use a sliding window to find the position of the indel
581            where the number of SNPs is lowest.
582        Assumes that there is only a single indel between the ref and the alt - there
583            may be cases where this does not work these will just produce large amounts
584            of SNPs... Could be adapted to check for multi-indel but this will scale
585            exponentially the number of versions checked.
586         */
587        let mut _x: String = "".to_string();
588        let mut _y: String = "".to_string();
589        let length = (reference.len() as i64 - alternate.len() as i64).abs();
590        let mut _indel_type = AltType::NULL;
591        // Figure out which way around to approach this
592        if reference.len() > alternate.len() {
593            _y = reference.clone();
594            _x = alternate.clone();
595            _indel_type = AltType::DEL;
596        } else {
597            _y = alternate.clone();
598            _x = reference.clone();
599            _indel_type = AltType::INS;
600        }
601
602        let padding = "N".repeat(length as usize);
603        let mut current = "".to_string();
604        let mut current_dist = i64::MAX;
605        let mut indel_start: i64 = 0;
606        for i in 0.._x.len() + 1 {
607            let x1 = _x[0..i].to_string() + &padding + &_x[i.._x.len()];
608            // println!("{:?}\t{:?}\t{:?}\t{:?}\t{:?}", reference, alternate, x, y, x1);
609            let dist = snp_dist(&x1, &_y);
610            if dist <= current_dist {
611                current = x1.clone();
612                current_dist = dist;
613                indel_start = i as i64;
614            }
615        }
616
617        if _indel_type == AltType::INS {
618            // Ins after, del at, so adjust
619            indel_start -= 1;
620        }
621
622        let mut indel_str = "".to_string();
623        for i in 0..current.len() {
624            if current.chars().nth(i).unwrap() == 'N' {
625                indel_str += &_y.chars().nth(i).unwrap().to_string();
626            }
627        }
628
629        calls.push((indel_start, _indel_type.clone(), indel_str));
630
631        if _indel_type == AltType::INS {
632            // Return to vector index now the call has been constructed
633            indel_start += 1;
634        }
635        let mut indel_left = length;
636        let mut offset = 0;
637        for i in 0.._y.len() {
638            if i as i64 == indel_start {
639                // Pad where the indel is to ensure we get all calls
640                if indel_left > 0 {
641                    indel_left -= 1;
642                    indel_start += 1;
643                    offset += 1;
644                }
645                continue;
646            }
647            let r = _y.chars().nth(i).unwrap();
648            let a = _x.chars().nth(i - offset).unwrap();
649            if r != 'N' && a != 'N' && r != a {
650                if _indel_type == AltType::DEL {
651                    if indel_left > 0 {
652                        // We have some deletion ahead of this positon, so it doesn't need to be adjusted
653                        calls.push(((i - offset) as i64, AltType::SNP, a.to_string()));
654                    } else {
655                        // We've passed the deletion, so adjust the offset to account for the deletion
656                        calls.push((
657                            (i - offset + length as usize) as i64,
658                            AltType::SNP,
659                            a.to_string(),
660                        ));
661                    }
662                } else {
663                    calls.push(((i - offset) as i64, AltType::SNP, r.to_string()));
664                }
665            }
666        }
667
668        // I hate implict returns, but appease clippy
669        calls
670    }
671}
672
673/// Calculate the distance between two strings ignoring N characters
674///
675/// # Arguments
676/// - `reference`: &str - Reference sequence
677/// - `alternate`: &str - Alternate sequence
678///
679/// # Returns
680/// SNP distance between the two strings
681fn snp_dist(reference: &str, alternate: &str) -> i64 {
682    if reference.len() != alternate.len() {
683        panic!("Reference and alternate strings are not the same length!");
684    }
685    let mut dist = 0;
686    for i in 0..reference.len() {
687        let r = reference.chars().nth(i).unwrap();
688        let a = alternate.chars().nth(i).unwrap();
689        if r != 'N' && a != 'N' && r != a {
690            dist += 1;
691        }
692    }
693
694    // I hate implict returns, but appease clippy
695    dist
696}
697
698#[cfg(test)]
699mod tests {
700    use super::*;
701
702    macro_rules! assert_panics {
703        ($expression:expr) => {
704            let result = std::panic::catch_unwind(|| $expression);
705            assert!(result.is_err());
706        };
707    }
708
709    #[test]
710    fn test_snp_dist() {
711        // Some trivial cases
712        assert_eq!(snp_dist("ACGT", "ACGT"), 0);
713        assert_eq!(snp_dist("AAAAA", "AAAAA"), 0);
714        assert_eq!(snp_dist("ACGTACGT", "ACGTACGT"), 0);
715        assert_eq!(snp_dist("ACGT", "AAGT"), 1);
716        assert_eq!(snp_dist("ACGT", "AAAT"), 2);
717        assert_eq!(snp_dist("ACGT", "AAAA"), 3);
718        assert_eq!(snp_dist("ACGT", "NNNN"), 0);
719        assert_eq!(snp_dist("ACGT", "NAGT"), 1);
720
721        // Should panic
722        assert_panics!(snp_dist("ACGT", "ACG"));
723        assert_panics!(snp_dist("", "ACGT"));
724        assert_panics!(snp_dist("ACGT", ""));
725    }
726
727    #[test]
728    fn test_simplify_call() {
729        // Some trivial cases
730        assert_eq!(
731            VCFFile::simplify_call("ACGT".to_string(), "ACGT".to_string()),
732            vec![]
733        );
734        assert_eq!(
735            VCFFile::simplify_call("AAAAA".to_string(), "AAAAA".to_string()),
736            vec![]
737        );
738        assert_eq!(
739            VCFFile::simplify_call("ACGTACGT".to_string(), "ACGTACGT".to_string()),
740            vec![]
741        );
742        assert_eq!(
743            VCFFile::simplify_call("ACGT".to_string(), "AAGT".to_string()),
744            vec![(1, AltType::SNP, "A".to_string())]
745        );
746        assert_eq!(
747            VCFFile::simplify_call("ACGT".to_string(), "AAAT".to_string()),
748            vec![
749                (1, AltType::SNP, "A".to_string()),
750                (2, AltType::SNP, "A".to_string()),
751            ]
752        );
753        assert_eq!(
754            VCFFile::simplify_call("ACGT".to_string(), "AAAA".to_string()),
755            vec![
756                (1, AltType::SNP, "A".to_string()),
757                (2, AltType::SNP, "A".to_string()),
758                (3, AltType::SNP, "A".to_string()),
759            ]
760        );
761        // Some more complex cases
762        assert_eq!(
763            VCFFile::simplify_call("ACGT".to_string(), "ACG".to_string()),
764            vec![(3, AltType::DEL, "T".to_string())]
765        );
766        assert_eq!(
767            VCFFile::simplify_call("ACG".to_string(), "ACGT".to_string()),
768            vec![(2, AltType::INS, "T".to_string())]
769        );
770        assert_eq!(
771            VCFFile::simplify_call("ACGT".to_string(), "AGT".to_string()),
772            vec![(1, AltType::DEL, "C".to_string()),]
773        );
774
775        // Somewhat pathological cases
776        assert_eq!(
777            VCFFile::simplify_call("AAAAA".to_string(), "CCC".to_string()),
778            vec![
779                (3, AltType::DEL, "AA".to_string()),
780                (0, AltType::SNP, "C".to_string()),
781                (1, AltType::SNP, "C".to_string()),
782                (2, AltType::SNP, "C".to_string()),
783            ]
784        );
785
786        assert_eq!(
787            VCFFile::simplify_call("CCC".to_string(), "AAAAA".to_string()),
788            vec![
789                (2, AltType::INS, "AA".to_string()),
790                (0, AltType::SNP, "A".to_string()),
791                (1, AltType::SNP, "A".to_string()),
792                (2, AltType::SNP, "A".to_string()),
793            ]
794        );
795
796        assert_eq!(
797            VCFFile::simplify_call("ACGT".to_string(), "ACGAGT".to_string()),
798            vec![(2, AltType::INS, "AG".to_string()),]
799        );
800
801        assert_eq!(
802            VCFFile::simplify_call(
803                "AGTGCGCCTCCCGCGAGCAGACACAGAATCGCACTGCGCCGGCCCGGCGCGTGCGATTCTGTGTCTGCTT"
804                    .to_string(),
805                "AGCTC".to_string()
806            ),
807            vec![
808                (
809                    2,
810                    AltType::DEL,
811                    "TGCGCCTCCCGCGAGCAGACACAGAATCGCACTGCGCCGGCCCGGCGCGTGCGATTCTGTGTCTG".to_string()
812                ),
813                (69, AltType::SNP, "C".to_string()),
814            ]
815        );
816    }
817
818    #[test]
819    fn test_parse_record_for_calls() {
820        // Note the use of blocks to allow collapsing in vscode.
821        // Not necessary but makes it easier to read as this is a long test
822
823        // There's an almost infinite number of edge cases here, so we'll just test a few
824
825        // Ref call
826        {
827            let record = VCFRow {
828                position: 1,
829                reference: "A".to_string(),
830                alternative: vec!["T".to_string()],
831                filter: vec![],
832                fields: HashMap::from([
833                    ("GT".to_string(), vec!["0/0".to_string()]),
834                    ("COV".to_string(), vec!["1".to_string()]),
835                ]),
836                is_filter_pass: true,
837                is_complex: false,
838            };
839            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 1, 0);
840            assert_eq!(calls.len(), 1);
841            assert_eq!(minor_calls.len(), 0);
842            assert_eq!(calls[0].genome_index, 1);
843            assert_eq!(calls[0].reference, "a".to_string());
844            assert_eq!(calls[0].alt, "a".to_string());
845            assert_eq!(calls[0].call_type, AltType::REF);
846            assert_eq!(calls[0].cov, Some(1));
847            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(1.0)));
848            assert!(!calls[0].is_minor);
849            assert_eq!(calls[0].vcf_idx, Some(0));
850            assert_eq!(calls[0].vcf_row, 0);
851        }
852
853        // Alt call
854        {
855            let record = VCFRow {
856                position: 1,
857                reference: "A".to_string(),
858                alternative: vec!["T".to_string()],
859                filter: vec![],
860                fields: HashMap::from([
861                    ("GT".to_string(), vec!["1/1".to_string()]),
862                    ("COV".to_string(), vec!["1".to_string(), "2".to_string()]),
863                ]),
864                is_filter_pass: true,
865                is_complex: false,
866            };
867            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 1, 0);
868            assert_eq!(calls.len(), 1);
869            assert_eq!(minor_calls.len(), 0);
870            assert_eq!(calls[0].genome_index, 1);
871            assert_eq!(calls[0].reference, "a".to_string());
872            assert_eq!(calls[0].alt, "t".to_string());
873            assert_eq!(calls[0].call_type, AltType::SNP);
874            assert_eq!(calls[0].cov, Some(2));
875            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(2.0 / 3.0)));
876            assert!(!calls[0].is_minor);
877            assert_eq!(calls[0].vcf_idx, Some(1));
878            assert_eq!(calls[0].vcf_row, 0);
879        }
880
881        // Het call (including minor call)
882        {
883            let record = VCFRow {
884                position: 1,
885                reference: "A".to_string(),
886                alternative: vec!["T".to_string()],
887                filter: vec![],
888                fields: HashMap::from([
889                    ("GT".to_string(), vec!["0/1".to_string()]),
890                    ("COV".to_string(), vec!["1".to_string(), "3".to_string()]),
891                ]),
892                is_filter_pass: true,
893                is_complex: false,
894            };
895            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 2, 0);
896            assert_eq!(calls.len(), 1);
897            assert_eq!(minor_calls.len(), 1);
898
899            assert_eq!(calls[0].genome_index, 1);
900            assert_eq!(calls[0].reference, "a".to_string());
901            assert_eq!(calls[0].alt, "z".to_string());
902            assert_eq!(calls[0].call_type, AltType::HET);
903            assert_eq!(calls[0].cov, None);
904            assert_eq!(calls[0].frs, None);
905            assert!(!calls[0].is_minor);
906            assert_eq!(calls[0].vcf_idx, None);
907            assert_eq!(calls[0].vcf_row, 0);
908
909            assert_eq!(minor_calls[0].genome_index, 1);
910            assert_eq!(minor_calls[0].reference, "a".to_string());
911            assert_eq!(minor_calls[0].alt, "t".to_string());
912            assert_eq!(minor_calls[0].call_type, AltType::SNP);
913            assert_eq!(minor_calls[0].cov, Some(3));
914            assert_eq!(minor_calls[0].frs, Some(ordered_float::OrderedFloat(0.75)));
915            assert!(minor_calls[0].is_minor);
916            assert_eq!(minor_calls[0].vcf_idx, Some(1));
917            assert_eq!(minor_calls[0].vcf_row, 0);
918        }
919
920        // Null call (in a few forms)
921        // Null call due to low coverage
922        {
923            let record = VCFRow {
924                position: 1,
925                reference: "A".to_string(),
926                alternative: vec!["T".to_string()],
927                filter: vec![],
928                fields: HashMap::from([
929                    ("GT".to_string(), vec!["1/1".to_string()]),
930                    ("COV".to_string(), vec!["1".to_string()]),
931                ]),
932                is_filter_pass: true,
933                is_complex: false,
934            };
935            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 2, 0);
936            assert_eq!(calls.len(), 1);
937            assert_eq!(minor_calls.len(), 0);
938            assert_eq!(calls[0].genome_index, 1);
939            assert_eq!(calls[0].reference, "a".to_string());
940            assert_eq!(calls[0].alt, "x".to_string());
941            assert_eq!(calls[0].call_type, AltType::NULL);
942            assert_eq!(calls[0].cov, Some(1));
943            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(1.0)));
944            assert!(!calls[0].is_minor);
945            assert_eq!(calls[0].vcf_idx, None);
946            assert_eq!(calls[0].vcf_row, 0);
947        }
948
949        // Null call due calling null
950        {
951            let record = VCFRow {
952                position: 1,
953                reference: "A".to_string(),
954                alternative: vec!["T".to_string()],
955                filter: vec![],
956                fields: HashMap::from([
957                    ("GT".to_string(), vec!["./.".to_string()]),
958                    ("COV".to_string(), vec!["1".to_string()]),
959                ]),
960                is_filter_pass: true,
961                is_complex: false,
962            };
963            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 1, 0);
964            assert_eq!(calls.len(), 1);
965            assert_eq!(minor_calls.len(), 0);
966            assert_eq!(calls[0].genome_index, 1);
967            assert_eq!(calls[0].reference, "a".to_string());
968            assert_eq!(calls[0].alt, "x".to_string());
969            assert_eq!(calls[0].call_type, AltType::NULL);
970            assert_eq!(calls[0].cov, Some(1));
971            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(1.0)));
972            assert!(!calls[0].is_minor);
973            assert_eq!(calls[0].vcf_idx, None);
974            assert_eq!(calls[0].vcf_row, 0);
975        }
976
977        // Null call due to low coverage and filter fail (should still make it)
978        {
979            let record = VCFRow {
980                position: 1,
981                reference: "A".to_string(),
982                alternative: vec!["T".to_string()],
983                filter: vec!["MIN_DP".to_string()],
984                fields: HashMap::from([
985                    ("GT".to_string(), vec!["1/1".to_string()]),
986                    ("COV".to_string(), vec!["1".to_string()]),
987                ]),
988                is_filter_pass: false,
989                is_complex: false,
990            };
991            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 2, 0);
992            assert_eq!(calls.len(), 1);
993            assert_eq!(minor_calls.len(), 0);
994            assert_eq!(calls[0].genome_index, 1);
995            assert_eq!(calls[0].reference, "a".to_string());
996            assert_eq!(calls[0].alt, "x".to_string());
997            assert_eq!(calls[0].call_type, AltType::NULL);
998            assert_eq!(calls[0].cov, Some(1));
999            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(1.0)));
1000            assert!(!calls[0].is_minor);
1001            assert_eq!(calls[0].vcf_idx, None);
1002            assert_eq!(calls[0].vcf_row, 0);
1003        }
1004
1005        // Ins call
1006        {
1007            let record = VCFRow {
1008                position: 1,
1009                reference: "A".to_string(),
1010                alternative: vec!["AT".to_string()],
1011                filter: vec![],
1012                fields: HashMap::from([
1013                    ("GT".to_string(), vec!["1/1".to_string()]),
1014                    ("COV".to_string(), vec!["1".to_string(), "5".to_string()]),
1015                ]),
1016                is_filter_pass: true,
1017                is_complex: false,
1018            };
1019            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 3, 123);
1020            assert_eq!(calls.len(), 1);
1021            assert_eq!(minor_calls.len(), 0);
1022            assert_eq!(calls[0].genome_index, 1);
1023            assert_eq!(calls[0].reference, "a".to_string());
1024            assert_eq!(calls[0].alt, "t".to_string());
1025            assert_eq!(calls[0].call_type, AltType::INS);
1026            assert_eq!(calls[0].cov, Some(5));
1027            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(5.0 / 6.0)));
1028            assert!(!calls[0].is_minor);
1029            assert_eq!(calls[0].vcf_idx, Some(1));
1030            assert_eq!(calls[0].vcf_row, 123);
1031        }
1032
1033        // Del call
1034        {
1035            let record = VCFRow {
1036                position: 1,
1037                reference: "AT".to_string(),
1038                alternative: vec!["A".to_string()],
1039                filter: vec![],
1040                fields: HashMap::from([
1041                    ("GT".to_string(), vec!["1/1".to_string()]),
1042                    ("COV".to_string(), vec!["1".to_string(), "5".to_string()]),
1043                ]),
1044                is_filter_pass: true,
1045                is_complex: false,
1046            };
1047            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 3, 0);
1048            assert_eq!(calls.len(), 1);
1049            assert_eq!(minor_calls.len(), 0);
1050            assert_eq!(calls[0].genome_index, 2);
1051            assert_eq!(calls[0].reference, "t".to_string());
1052            assert_eq!(calls[0].alt, "t".to_string());
1053            assert_eq!(calls[0].call_type, AltType::DEL);
1054            assert_eq!(calls[0].cov, Some(5));
1055            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(5.0 / 6.0)));
1056            assert!(!calls[0].is_minor);
1057            assert_eq!(calls[0].vcf_idx, Some(1));
1058            assert_eq!(calls[0].vcf_row, 0);
1059        }
1060
1061        // Del call with SNP
1062        {
1063            let record = VCFRow {
1064                position: 1,
1065                reference: "AT".to_string(),
1066                alternative: vec!["C".to_string()],
1067                filter: vec![],
1068                fields: HashMap::from([
1069                    ("GT".to_string(), vec!["1/1".to_string()]),
1070                    ("COV".to_string(), vec!["1".to_string(), "5".to_string()]),
1071                ]),
1072                is_filter_pass: true,
1073                is_complex: false,
1074            };
1075            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 3, 0);
1076            assert_eq!(calls.len(), 2);
1077            assert_eq!(minor_calls.len(), 0);
1078            assert_eq!(calls[0].genome_index, 2);
1079            assert_eq!(calls[0].reference, "t".to_string());
1080            assert_eq!(calls[0].alt, "t".to_string());
1081            assert_eq!(calls[0].call_type, AltType::DEL);
1082            assert_eq!(calls[0].cov, Some(5));
1083            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(5.0 / 6.0)));
1084            assert!(!calls[0].is_minor);
1085            assert_eq!(calls[0].vcf_idx, Some(1));
1086
1087            assert_eq!(calls[1].genome_index, 1);
1088            assert_eq!(calls[1].reference, "a".to_string());
1089            assert_eq!(calls[1].alt, "c".to_string());
1090            assert_eq!(calls[1].call_type, AltType::SNP);
1091            assert_eq!(calls[1].cov, Some(5));
1092            assert_eq!(calls[1].frs, Some(ordered_float::OrderedFloat(5.0 / 6.0)));
1093            assert!(!calls[1].is_minor);
1094            assert_eq!(calls[1].vcf_idx, Some(1));
1095            assert_eq!(calls[0].vcf_row, 0);
1096        }
1097
1098        // More complex mix of SNP and indel minors
1099        {
1100            let record = VCFRow {
1101                position: 1,
1102                reference: "AT".to_string(),
1103                alternative: vec!["C".to_string(), "GCC".to_string()],
1104                filter: vec![],
1105                fields: HashMap::from([
1106                    ("GT".to_string(), vec!["1/1".to_string()]),
1107                    (
1108                        "COV".to_string(),
1109                        vec!["1".to_string(), "57".to_string(), "12".to_string()],
1110                    ),
1111                ]),
1112                is_filter_pass: true,
1113                is_complex: false,
1114            };
1115            let (calls, minor_calls) = VCFFile::parse_record_for_calls(record, 3, 0);
1116            assert_eq!(calls.len(), 2);
1117            assert_eq!(minor_calls.len(), 3);
1118
1119            assert_eq!(calls[0].genome_index, 2);
1120            assert_eq!(calls[0].reference, "t".to_string());
1121            assert_eq!(calls[0].alt, "t".to_string());
1122            assert_eq!(calls[0].call_type, AltType::DEL);
1123            assert_eq!(calls[0].cov, Some(57));
1124            assert_eq!(calls[0].frs, Some(ordered_float::OrderedFloat(57.0 / 70.0)));
1125            assert!(!calls[0].is_minor);
1126            assert_eq!(calls[0].vcf_idx, Some(1));
1127            assert_eq!(calls[0].vcf_row, 0);
1128
1129            assert_eq!(calls[1].genome_index, 1);
1130            assert_eq!(calls[1].reference, "a".to_string());
1131            assert_eq!(calls[1].alt, "c".to_string());
1132            assert_eq!(calls[1].call_type, AltType::SNP);
1133            assert_eq!(calls[1].cov, Some(57));
1134            assert_eq!(calls[1].frs, Some(ordered_float::OrderedFloat(57.0 / 70.0)));
1135            assert!(!calls[1].is_minor);
1136            assert_eq!(calls[1].vcf_idx, Some(1));
1137            assert_eq!(calls[1].vcf_row, 0);
1138
1139            assert_eq!(minor_calls[0].genome_index, 2);
1140            assert_eq!(minor_calls[0].reference, "t".to_string());
1141            assert_eq!(minor_calls[0].alt, "c".to_string());
1142            assert_eq!(minor_calls[0].call_type, AltType::INS);
1143            assert_eq!(minor_calls[0].cov, Some(12));
1144            assert_eq!(
1145                minor_calls[0].frs,
1146                Some(ordered_float::OrderedFloat(12.0 / 70.0))
1147            );
1148            assert!(minor_calls[0].is_minor);
1149            assert_eq!(minor_calls[0].vcf_idx, Some(2));
1150            assert_eq!(minor_calls[0].vcf_row, 0);
1151
1152            assert_eq!(minor_calls[1].genome_index, 1);
1153            assert_eq!(minor_calls[1].reference, "a".to_string());
1154            assert_eq!(minor_calls[1].alt, "g".to_string());
1155            assert_eq!(minor_calls[1].call_type, AltType::SNP);
1156            assert_eq!(minor_calls[1].cov, Some(12));
1157            assert_eq!(
1158                minor_calls[1].frs,
1159                Some(ordered_float::OrderedFloat(12.0 / 70.0))
1160            );
1161            assert!(minor_calls[1].is_minor);
1162            assert_eq!(minor_calls[1].vcf_idx, Some(2));
1163            assert_eq!(minor_calls[1].vcf_row, 0);
1164
1165            assert_eq!(minor_calls[2].genome_index, 2);
1166            assert_eq!(minor_calls[2].reference, "t".to_string());
1167            assert_eq!(minor_calls[2].alt, "c".to_string());
1168            assert_eq!(minor_calls[2].call_type, AltType::SNP);
1169            assert_eq!(minor_calls[2].cov, Some(12));
1170            assert_eq!(
1171                minor_calls[2].frs,
1172                Some(ordered_float::OrderedFloat(12.0 / 70.0))
1173            );
1174            assert!(minor_calls[2].is_minor);
1175            assert_eq!(minor_calls[2].vcf_idx, Some(2));
1176            assert_eq!(minor_calls[2].vcf_row, 0);
1177        }
1178    }
1179
1180    #[test]
1181    fn test_instanciate_vcffile() {
1182        let vcf = VCFFile::new("test/dummy.vcf".to_string(), false, 3);
1183        let expected_records = vec![
1184            VCFRow {
1185                position: 4687,
1186                reference: "t".to_string(),
1187                alternative: vec!["c".to_string()],
1188                filter: vec!["PASS".to_string()],
1189                fields: HashMap::from([
1190                    ("GT".to_string(), vec!["1/1".to_string()]),
1191                    ("DP".to_string(), vec!["68".to_string()]),
1192                    ("COV".to_string(), vec!["0".to_string(), "68".to_string()]),
1193                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1194                ]),
1195                is_filter_pass: true,
1196                is_complex: false,
1197            },
1198            VCFRow {
1199                position: 4725,
1200                reference: "t".to_string(),
1201                alternative: vec!["c".to_string()],
1202                filter: vec![],
1203                fields: HashMap::from([
1204                    ("GT".to_string(), vec!["1/1".to_string()]),
1205                    ("DP".to_string(), vec!["68".to_string()]),
1206                    ("COV".to_string(), vec!["0".to_string(), "68".to_string()]),
1207                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1208                ]),
1209                is_filter_pass: false,
1210                is_complex: false,
1211            },
1212            VCFRow {
1213                position: 4730,
1214                reference: "c".to_string(),
1215                alternative: vec!["t".to_string(), "g".to_string()],
1216                filter: vec!["PASS".to_string()],
1217                fields: HashMap::from([
1218                    ("GT".to_string(), vec!["1/2".to_string()]),
1219                    ("DP".to_string(), vec!["200".to_string()]),
1220                    (
1221                        "COV".to_string(),
1222                        vec!["1".to_string(), "99".to_string(), "100".to_string()],
1223                    ),
1224                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1225                ]),
1226                is_filter_pass: true,
1227                is_complex: false,
1228            },
1229            VCFRow {
1230                position: 4735,
1231                reference: "g".to_string(),
1232                alternative: vec!["gcc".to_string()],
1233                filter: vec!["PASS".to_string()],
1234                fields: HashMap::from([
1235                    ("GT".to_string(), vec!["1/1".to_string()]),
1236                    ("DP".to_string(), vec!["68".to_string()]),
1237                    ("COV".to_string(), vec!["0".to_string(), "68".to_string()]),
1238                    ("GT_CONF".to_string(), vec!["63.77".to_string()]),
1239                ]),
1240                is_filter_pass: true,
1241                is_complex: false,
1242            },
1243            VCFRow {
1244                position: 4740,
1245                reference: "c".to_string(),
1246                alternative: vec!["gtt".to_string()],
1247                filter: vec![],
1248                fields: HashMap::from([
1249                    ("GT".to_string(), vec!["1/1".to_string()]),
1250                    ("DP".to_string(), vec!["68".to_string()]),
1251                    ("COV".to_string(), vec!["0".to_string(), "68".to_string()]),
1252                    ("GT_CONF".to_string(), vec!["63.77".to_string()]),
1253                ]),
1254                is_filter_pass: false,
1255                is_complex: false,
1256            },
1257            VCFRow {
1258                position: 13148,
1259                reference: "t".to_string(),
1260                alternative: vec!["g".to_string()],
1261                filter: vec![],
1262                fields: HashMap::from([
1263                    ("GT".to_string(), vec!["./.".to_string()]),
1264                    ("DP".to_string(), vec!["68".to_string()]),
1265                    ("COV".to_string(), vec!["0".to_string(), "68".to_string()]),
1266                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1267                ]),
1268                is_filter_pass: false,
1269                is_complex: false,
1270            },
1271            VCFRow {
1272                position: 13149,
1273                reference: "g".to_string(),
1274                alternative: vec!["t".to_string()],
1275                filter: vec!["PASS".to_string()],
1276                fields: HashMap::from([
1277                    ("GT".to_string(), vec!["./.".to_string()]),
1278                    ("DP".to_string(), vec!["68".to_string()]),
1279                    ("COV".to_string(), vec!["0".to_string(), "68".to_string()]),
1280                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1281                ]),
1282                is_filter_pass: true,
1283                is_complex: false,
1284            },
1285            VCFRow {
1286                position: 13150,
1287                reference: "a".to_string(),
1288                alternative: vec!["tcg".to_string()],
1289                filter: vec!["PASS".to_string()],
1290                fields: HashMap::from([
1291                    ("GT".to_string(), vec!["./.".to_string()]),
1292                    ("DP".to_string(), vec!["68".to_string()]),
1293                    ("COV".to_string(), vec!["0".to_string(), "68".to_string()]),
1294                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1295                ]),
1296                is_filter_pass: true,
1297                is_complex: false,
1298            },
1299            VCFRow {
1300                position: 13333,
1301                reference: "c".to_string(),
1302                alternative: vec!["a".to_string(), "g".to_string()],
1303                filter: vec![],
1304                fields: HashMap::from([
1305                    ("GT".to_string(), vec!["1/2".to_string()]),
1306                    ("DP".to_string(), vec!["100".to_string()]),
1307                    (
1308                        "COV".to_string(),
1309                        vec!["2".to_string(), "50".to_string(), "48".to_string()],
1310                    ),
1311                    ("GT_CONF".to_string(), vec!["613".to_string()]),
1312                ]),
1313                is_filter_pass: false,
1314                is_complex: false,
1315            },
1316            VCFRow {
1317                // Edge case of using `RO` and `AO` for coverage
1318                position: 13335,
1319                reference: "t".to_string(),
1320                alternative: vec!["a".to_string()],
1321                filter: vec!["MAX_DP".to_string()],
1322                fields: HashMap::from([
1323                    ("GT".to_string(), vec!["1/1".to_string()]),
1324                    ("DP".to_string(), vec!["68".to_string()]),
1325                    ("RO".to_string(), vec!["66".to_string()]),
1326                    ("AO".to_string(), vec!["2".to_string()]),
1327                    ("COV".to_string(), vec!["66".to_string(), "2".to_string()]),
1328                    ("GT_CONF".to_string(), vec!["3.77".to_string()]),
1329                ]),
1330                is_filter_pass: false,
1331                is_complex: false,
1332            },
1333            VCFRow {
1334                // Odd edge case which is a valid VCF row where it's a het GT but single COV value
1335                position: 13336,
1336                reference: "c".to_string(),
1337                alternative: vec!["a".to_string()],
1338                filter: vec![],
1339                fields: HashMap::from([
1340                    ("GT".to_string(), vec!["0/1".to_string()]),
1341                    ("DP".to_string(), vec!["50".to_string()]),
1342                    ("COV".to_string(), vec!["50".to_string()]),
1343                    ("GT_CONF".to_string(), vec!["613".to_string()]),
1344                ]),
1345                is_filter_pass: false,
1346                is_complex: false,
1347            },
1348            VCFRow {
1349                // Null call by virtue of failing specific filter
1350                position: 14000,
1351                reference: "c".to_string(),
1352                alternative: vec!["a".to_string()],
1353                filter: vec!["MIN_QUAL".to_string(), "MIN_FRS".to_string()],
1354                fields: HashMap::from([
1355                    ("GT".to_string(), vec!["1/1".to_string()]),
1356                    ("DP".to_string(), vec!["50".to_string()]),
1357                    ("COV".to_string(), vec!["50".to_string()]),
1358                    ("GT_CONF".to_string(), vec!["613".to_string()]),
1359                ]),
1360                is_filter_pass: false,
1361                is_complex: false,
1362            },
1363        ];
1364        for (idx, record) in expected_records.iter().enumerate() {
1365            assert_eq!(record.position, vcf.records[idx].position);
1366            assert_eq!(record.reference, vcf.records[idx].reference);
1367            assert_eq!(record.alternative, vcf.records[idx].alternative);
1368            assert_eq!(record.filter, vcf.records[idx].filter);
1369            assert_eq!(record.fields, vcf.records[idx].fields);
1370            assert_eq!(record.is_filter_pass, vcf.records[idx].is_filter_pass);
1371        }
1372
1373        let expected_calls = [
1374            vec![Evidence {
1375                cov: Some(68),
1376                frs: Some(ordered_float::OrderedFloat(1.0)),
1377                genotype: "1/1".to_string(),
1378                call_type: AltType::SNP,
1379                vcf_row: 0,
1380                reference: "t".to_string(),
1381                alt: "c".to_string(),
1382                genome_index: 4687,
1383                is_minor: false,
1384                vcf_idx: Some(1),
1385            }],
1386            vec![Evidence {
1387                cov: None,
1388                frs: None,
1389                genotype: "1/2".to_string(),
1390                call_type: AltType::HET,
1391                vcf_row: 2,
1392                reference: "c".to_string(),
1393                alt: "z".to_string(),
1394                genome_index: 4730,
1395                is_minor: false,
1396                vcf_idx: None,
1397            }],
1398            vec![Evidence {
1399                cov: Some(68),
1400                frs: Some(ordered_float::OrderedFloat(1.0)),
1401                genotype: "1/1".to_string(),
1402                call_type: AltType::INS,
1403                vcf_row: 3,
1404                reference: "g".to_string(),
1405                alt: "cc".to_string(),
1406                genome_index: 4735,
1407                is_minor: false,
1408                vcf_idx: Some(1),
1409            }],
1410            vec![Evidence {
1411                cov: None,
1412                frs: None,
1413                genotype: "./.".to_string(),
1414                call_type: AltType::NULL,
1415                vcf_row: 5,
1416                reference: "t".to_string(),
1417                alt: "x".to_string(),
1418                genome_index: 13148,
1419                is_minor: false,
1420                vcf_idx: None,
1421            }],
1422            vec![Evidence {
1423                cov: None,
1424                frs: None,
1425                genotype: "./.".to_string(),
1426                call_type: AltType::NULL,
1427                vcf_row: 6,
1428                reference: "g".to_string(),
1429                alt: "x".to_string(),
1430                genome_index: 13149,
1431                is_minor: false,
1432                vcf_idx: None,
1433            }],
1434            vec![Evidence {
1435                cov: None,
1436                frs: None,
1437                genotype: "./.".to_string(),
1438                call_type: AltType::NULL,
1439                vcf_row: 7,
1440                reference: "a".to_string(),
1441                alt: "x".to_string(),
1442                genome_index: 13150,
1443                is_minor: false,
1444                vcf_idx: None,
1445            }],
1446            // Null call ignoring filter fails
1447            vec![Evidence {
1448                cov: Some(2),
1449                frs: Some(ordered_float::OrderedFloat(0.029411765)),
1450                genotype: "1/1".to_string(),
1451                call_type: AltType::NULL,
1452                vcf_row: 9,
1453                reference: "t".to_string(),
1454                alt: "x".to_string(),
1455                genome_index: 13335,
1456                is_minor: false,
1457                vcf_idx: Some(1),
1458            }],
1459            // Null call caused by specific filter fail
1460            vec![Evidence {
1461                cov: Some(0),
1462                frs: Some(ordered_float::OrderedFloat(0.0)),
1463                genotype: "1/1".to_string(),
1464                call_type: AltType::NULL,
1465                vcf_row: 11,
1466                reference: "c".to_string(),
1467                alt: "x".to_string(),
1468                genome_index: 14000,
1469                is_minor: false,
1470                vcf_idx: None,
1471            }],
1472        ];
1473
1474        for calls in expected_calls.iter() {
1475            let actual = vcf.calls.get(&calls[0].genome_index).unwrap();
1476            for (idx, call) in calls.iter().enumerate() {
1477                assert_eq!(call.cov, actual[idx].cov);
1478                assert_eq!(call.frs, actual[idx].frs);
1479                assert_eq!(call.genotype, actual[idx].genotype);
1480                assert_eq!(call.call_type, actual[idx].call_type);
1481                assert_eq!(call.vcf_row, actual[idx].vcf_row);
1482                assert_eq!(call.reference, actual[idx].reference);
1483                assert_eq!(call.alt, actual[idx].alt);
1484                assert_eq!(call.genome_index, actual[idx].genome_index);
1485                assert_eq!(call.is_minor, actual[idx].is_minor);
1486                assert_eq!(call.vcf_idx, actual[idx].vcf_idx);
1487            }
1488        }
1489        assert_eq!(vcf.calls.keys().len(), expected_calls.len());
1490
1491        let expected_minor_calls = [vec![
1492            Evidence {
1493                cov: Some(99),
1494                frs: Some(ordered_float::OrderedFloat(0.495)),
1495                genotype: "1/2".to_string(),
1496                call_type: AltType::SNP,
1497                vcf_row: 2,
1498                reference: "c".to_string(),
1499                alt: "t".to_string(),
1500                genome_index: 4730,
1501                is_minor: true,
1502                vcf_idx: Some(1),
1503            },
1504            Evidence {
1505                cov: Some(100),
1506                frs: Some(ordered_float::OrderedFloat(0.5)),
1507                genotype: "1/2".to_string(),
1508                call_type: AltType::SNP,
1509                vcf_row: 2,
1510                reference: "c".to_string(),
1511                alt: "g".to_string(),
1512                genome_index: 4730,
1513                is_minor: true,
1514                vcf_idx: Some(2),
1515            },
1516        ]];
1517
1518        for calls in expected_minor_calls.iter() {
1519            let actual = vcf.minor_calls.get(&calls[0].genome_index).unwrap();
1520            for (idx, call) in calls.iter().enumerate() {
1521                assert_eq!(call.cov, actual[idx].cov);
1522                assert_eq!(call.frs, actual[idx].frs);
1523                assert_eq!(call.genotype, actual[idx].genotype);
1524                assert_eq!(call.call_type, actual[idx].call_type);
1525                assert_eq!(call.vcf_row, actual[idx].vcf_row);
1526                assert_eq!(call.reference, actual[idx].reference);
1527                assert_eq!(call.alt, actual[idx].alt);
1528                assert_eq!(call.genome_index, actual[idx].genome_index);
1529                assert_eq!(call.is_minor, actual[idx].is_minor);
1530                assert_eq!(call.vcf_idx, actual[idx].vcf_idx);
1531            }
1532        }
1533    }
1534
1535    #[test]
1536    fn test_complex_is_detected() {
1537        // Check that the is_complex flag gets set if a VCF has a complex row in it
1538        // This VCF has 1 standard row, then 2 almost complex rows, then a complex row
1539        // Standard SNP at 4687
1540        // Ref of length 1000 with 2 alts at 4730
1541        // Ref of length 1001 with 1 alt at 5730
1542        // Ref of length 1001 with 2 alts at 7030 <-- this is the complex row
1543
1544        let vcf = VCFFile::new("test/complex.vcf".to_string(), false, 3);
1545
1546        let expected_records = vec![
1547            VCFRow {
1548                position: 4687,
1549                reference: "t".to_string(),
1550                alternative: vec!["c".to_string()],
1551                filter: vec!["PASS".to_string()],
1552                fields: HashMap::from([
1553                    ("GT".to_string(), vec!["1/1".to_string()]),
1554                    ("DP".to_string(), vec!["68".to_string()]),
1555                    ("COV".to_string(), vec!["0".to_string(), "68".to_string()]),
1556                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1557                ]),
1558                is_filter_pass: true,is_complex: false,
1559            },
1560            VCFRow {
1561                position: 4730,
1562                reference: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".to_string(),
1563                alternative: vec!["t".to_string(), "g".to_string()],
1564                filter: vec!["PASS".to_string()],
1565                fields: HashMap::from([
1566                    ("GT".to_string(), vec!["1/1".to_string()]),
1567                    ("DP".to_string(), vec!["200".to_string()]),
1568                    ("COV".to_string(), vec!["1".to_string(), "99".to_string(), "100".to_string()]),
1569                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1570                ]),
1571                is_filter_pass: true,is_complex: false,
1572            },
1573            VCFRow {
1574                position: 5730,
1575                reference: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".to_string(),
1576                alternative: vec!["t".to_string()],
1577                filter: vec!["PASS".to_string()],
1578                fields: HashMap::from([
1579                    ("GT".to_string(), vec!["1/1".to_string()]),
1580                    ("DP".to_string(), vec!["200".to_string()]),
1581                    ("COV".to_string(), vec!["1".to_string(), "99".to_string()]),
1582                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1583                ]),
1584                is_filter_pass: true,is_complex: false,
1585            },
1586            VCFRow {
1587                position: 7030,
1588                reference: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".to_string(),
1589                alternative: vec!["t".to_string(), "g".to_string()],
1590                filter: vec!["PASS".to_string()],
1591                fields: HashMap::from([
1592                    ("GT".to_string(), vec!["2/2".to_string()]),
1593                    ("DP".to_string(), vec!["200".to_string()]),
1594                    ("COV".to_string(), vec!["1".to_string(), "99".to_string(), "100".to_string()]),
1595                    ("GT_CONF".to_string(), vec!["613.77".to_string()]),
1596                ]),
1597                is_filter_pass: true,is_complex: true,
1598            },
1599        ];
1600
1601        for (idx, record) in expected_records.iter().enumerate() {
1602            assert_eq!(record.position, vcf.records[idx].position);
1603            assert_eq!(record.reference, vcf.records[idx].reference);
1604            assert_eq!(record.alternative, vcf.records[idx].alternative);
1605            assert_eq!(record.filter, vcf.records[idx].filter);
1606            assert_eq!(record.fields, vcf.records[idx].fields);
1607            assert_eq!(record.is_filter_pass, vcf.records[idx].is_filter_pass);
1608        }
1609    }
1610}