tidyvcf 0.7.3

Command-line tool to convert VCF files to tab- or comma-separated tables
Documentation
use indexmap::IndexSet;
use noodles::core::position::Position;
use noodles::vcf::variant::record::samples::series::value::genotype::Phasing;
use noodles::vcf::variant::record::samples::series::value::Genotype;
use noodles::vcf::variant::{self, RecordBuf};

use std::error::Error;
use std::path::Path;

use crate::cli::Opt;
use crate::consts::VEP_DESC_PREFIX;
use crate::consts::VEP_FIELD_NAME;

/// Splits the field listing of a VEP `CSQ` header description into names.
///
/// The first `VEP_DESC_PREFIX.len()` bytes of `csq` are skipped and the
/// remainder is split on `'|'`, each piece becoming one owned `String`.
///
/// # Panics
///
/// Panics if `csq` is shorter than `VEP_DESC_PREFIX`, or if the prefix length
/// does not fall on a UTF-8 character boundary of `csq`.
pub fn split_header_csq_fields(csq: &str) -> Vec<String> {
    let field_list = &csq[VEP_DESC_PREFIX.len()..];
    let mut names = Vec::new();
    for name in field_list.split('|') {
        names.push(String::from(name));
    }
    names
}

/// Renders the value stored under `key` in a single sample as text.
///
/// Returns a copy of `missing_string` when the sample has no entry for `key`
/// or when the entry is present but carries no value.
///
/// # Errors
///
/// Propagates any error produced while formatting the value (see
/// `fmt_field_string`).
pub fn get_fmt_field(
    key: &String,
    sample: &variant::record_buf::samples::Sample,
    missing_string: &String,
) -> Result<String, Box<dyn Error>> {
    match sample.get(key) {
        Some(Some(value)) => fmt_field_string(value),
        _ => Ok(missing_string.clone()),
    }
}

/// Reports whether the extension of `path` is exactly `gz`.
///
/// The comparison is case-sensitive, and a path without an extension yields
/// `false`.
pub fn is_path_gz(path: &Path) -> bool {
    matches!(path.extension(), Some(ext) if ext == "gz")
}

/// Returns an owned copy of the contig name.
pub fn contig_string(contig: &str) -> String {
    String::from(contig)
}

/// Appends every requested per-sample value of `record` to `fields`.
///
/// Samples are visited in record order and, within each sample, keys are
/// visited in `fmt_keys` order, so `fields` grows by one entry per
/// (sample, key) pair. Absent values are written as `opt.missing_string`.
///
/// # Errors
///
/// Stops at the first value that fails to format and returns that error;
/// entries pushed before the failure remain in `fields`.
pub fn push_fmts_cartesian(
    fields: &mut Vec<String>,
    record: &RecordBuf,
    fmt_keys: &Vec<&String>,
    opt: &Opt,
) -> Result<(), Box<dyn Error>> {
    record
        .samples()
        .values()
        .try_for_each(|sample| -> Result<(), Box<dyn Error>> {
            fmt_keys
                .iter()
                .try_for_each(|key| -> Result<(), Box<dyn Error>> {
                    let value = get_fmt_field(key, &sample, &opt.missing_string)?;
                    fields.push(value);
                    Ok(())
                })
        })
}

/// Builds one output row per entry of `samples` from the per-sample values of
/// `record`.
///
/// The record's samples are paired positionally with `samples`. Each paired
/// row holds the values for `fmt_keys` (in order, with `opt.missing_string`
/// standing in for absent values) followed by the sample name as the last
/// element. If the record yields fewer samples than `samples` contains, the
/// trailing rows are left empty.
///
/// # Errors
///
/// Returns the first error raised while formatting a value.
pub fn push_fmts_stacked(
    record: &RecordBuf,
    fmt_keys: &Vec<&String>,
    opt: &Opt,
    samples: &IndexSet<String>,
) -> Result<Vec<Vec<String>>, Box<dyn Error>> {
    let mut rows: Vec<Vec<String>> = vec![Vec::new(); samples.len()];

    // An empty zip simply skips the loop, so no separate emptiness check is
    // needed before iterating.
    for ((sample_fmt, sample), row) in record.samples().values().zip(samples).zip(rows.iter_mut()) {
        for key in fmt_keys {
            row.push(get_fmt_field(key, &sample_fmt, &opt.missing_string)?);
        }
        row.push(sample.clone());
    }

    Ok(rows)
}

/// Converts a single INFO value to its textual form.
///
/// Scalars use their `to_string` rendering, a flag becomes the literal
/// `"true"`, and arrays are delegated to `info_array_string`.
fn info_field_string(field: &variant::record_buf::info::field::Value) -> String {
    use variant::record_buf::info::field::Value;

    match field {
        Value::Flag => String::from("true"),
        Value::String(text) => text.to_owned(),
        Value::Array(items) => info_array_string(items),
        Value::Integer(number) => number.to_string(),
        Value::Float(number) => number.to_string(),
        Value::Character(ch) => ch.to_string(),
    }
}

/// Renders a genotype as text, e.g. `0/1`, `1|1`, `./.` or `1`.
///
/// Every allele is written as its index, or as `.` when the allele is
/// missing. Alleles are joined with `|` when the second allele is phased and
/// with `/` otherwise; a single-allele genotype is written without a
/// separator and a genotype with no alleles yields an empty string.
///
/// # Errors
///
/// Returns an error if an allele cannot be read from the genotype, or if
/// adjacent alleles carry different phasing.
fn genotype_string(
    g: &variant::record_buf::samples::sample::value::Genotype,
) -> Result<String, Box<dyn Error>> {
    let alleles: Vec<(Option<usize>, Phasing)> = g.iter().collect::<Result<Vec<_>, _>>()?;

    if alleles.windows(2).any(|pair| pair[0].1 != pair[1].1) {
        return Err(Box::from(
            "Thought unreachable, inconsistent phasing between alleles in genotype!",
        ));
    }

    // Phasing is uniform at this point, so the second allele (when present)
    // decides the separator. `get` avoids indexing past the end for genotypes
    // with fewer than two alleles.
    let separator = match alleles.get(1) {
        Some((_, Phasing::Phased)) => "|",
        _ => "/",
    };

    // A missing allele must stay ".": defaulting it to 0 would turn a missing
    // call such as "./." into a homozygous-reference "0/0".
    let rendered: Vec<String> = alleles
        .iter()
        .map(|(position, _)| match position {
            Some(index) => index.to_string(),
            None => ".".to_owned(),
        })
        .collect();

    Ok(rendered.join(separator))
}

/// Converts a single per-sample value to its textual form.
///
/// Scalars use their `to_string` rendering, arrays are delegated to
/// `fmt_array_string`, and genotypes to `genotype_string`.
///
/// # Errors
///
/// Only the genotype variant can fail; its error is returned unchanged.
fn fmt_field_string(
    field: &variant::record_buf::samples::sample::Value,
) -> Result<String, Box<dyn Error>> {
    use variant::record_buf::samples::sample::Value;

    let rendered = match field {
        // The only fallible conversion: hand its result straight back.
        Value::Genotype(genotype) => return genotype_string(genotype),
        Value::String(text) => text.to_owned(),
        Value::Array(items) => fmt_array_string(items),
        Value::Integer(number) => number.to_string(),
        Value::Float(number) => number.to_string(),
        Value::Character(ch) => ch.to_string(),
    };
    Ok(rendered)
}

/// Renders an INFO array value as comma-separated text, whatever its element
/// type; missing elements appear as `.` (see `vec_option_string`).
fn info_array_string(array: &variant::record_buf::info::field::value::Array) -> String {
    use variant::record_buf::info::field::value::Array;

    match array {
        Array::Integer(values) => vec_option_string(values),
        Array::Float(values) => vec_option_string(values),
        Array::Character(values) => vec_option_string(values),
        Array::String(values) => vec_option_string(values),
    }
}

/// Renders a per-sample array value as comma-separated text, whatever its
/// element type; missing elements appear as `.` (see `vec_option_string`).
fn fmt_array_string(array: &variant::record_buf::samples::sample::value::Array) -> String {
    use variant::record_buf::samples::sample::value::Array;

    match array {
        Array::Integer(values) => vec_option_string(values),
        Array::Float(values) => vec_option_string(values),
        Array::Character(values) => vec_option_string(values),
        Array::String(values) => vec_option_string(values),
    }
}

/// Joins a slice of optional values with commas.
///
/// Present elements are rendered through `ToString`; absent elements are
/// written as `.`. An empty slice produces an empty string.
fn vec_option_string<T: ToString>(vec: &[Option<T>]) -> String {
    let rendered: Vec<String> = vec
        .iter()
        .map(|slot| {
            slot.as_ref()
                .map_or_else(|| String::from("."), T::to_string)
        })
        .collect();
    rendered.join(",")
}

/// Appends the textual value of every key in `info_keys` to `fields`.
///
/// Ordinary keys contribute exactly one entry: the rendered value, or
/// `opt.missing_string` when the key is absent or has no value.
///
/// When `split_csq` is non-zero, the key equal to `VEP_FIELD_NAME` is instead
/// expanded into exactly `split_csq` entries: its text is split on `'|'` into
/// at most `split_csq` pieces (the last piece keeps any remaining text) and
/// then padded with `opt.missing_string`. For a string-array value, each
/// present element is concatenated with a trailing `'|'` before splitting.
///
/// # Panics
///
/// Panics if the value being split is neither a string nor a string array.
pub fn push_info_fields(
    info_keys: &Vec<&String>,
    info: &variant::record_buf::Info,
    fields: &mut Vec<String>,
    split_csq: usize,
    opt: &Opt,
) {
    use variant::record_buf::info::field::{value::Array, Value};

    for info_field in info_keys {
        let expand_csq = split_csq != 0 && *info_field == VEP_FIELD_NAME;

        // Ordinary field: exactly one output column.
        if !expand_csq {
            let rendered = match info.get(*info_field) {
                Some(Some(value)) => info_field_string(value),
                Some(None) | None => opt.missing_string.to_owned(),
            };
            fields.push(rendered);
            continue;
        }

        // CSQ field that has to be expanded into `split_csq` columns.
        let mut subfields: Vec<String> = match info.get(*info_field) {
            Some(Some(Value::String(annotation))) => annotation
                .splitn(split_csq, '|')
                .map(String::from)
                .collect(),
            Some(Some(Value::Array(Array::String(annotations)))) => {
                let mut joined = String::new();
                for annotation in annotations.iter().flatten() {
                    joined.push_str(annotation);
                    joined.push('|');
                }
                joined.splitn(split_csq, '|').map(String::from).collect()
            }
            Some(Some(_)) => panic!("CSQ field not string(s)!"),
            Some(None) | None => Vec::new(),
        };

        // `splitn` never yields more than `split_csq` pieces, so this only
        // ever pads up to the expected column count.
        if subfields.len() < split_csq {
            subfields.resize(split_csq, opt.missing_string.to_owned());
        }
        fields.extend(subfields);
    }
}

/// Renders the variant start position as text, writing `0` when the position
/// is absent.
// NOTE(review): presumably an absent position corresponds to a VCF POS of 0;
// confirm against the noodles documentation.
fn variant_start_string(pos: Option<Position>) -> String {
    pos.map_or_else(|| String::from("0"), |position| position.get().to_string())
}

/// Collects the fixed VCF columns of `record` as strings.
///
/// Columns are emitted in the order contig, pos, id, ref, alt, qual, filter,
/// and each one only when `opt.include_column` accepts its name. IDs,
/// alternate bases and filters are comma-joined; a passing filter set is
/// written as `PASS`; a missing quality score is written as
/// `opt.missing_string`.
// NOTE(review): empty ID and filter sets render as an empty string rather
// than `opt.missing_string`; confirm that this asymmetry with `qual` is
// intended.
pub fn main_fields_from_record(record: &RecordBuf, opt: &Opt) -> Vec<String> {
    let mut columns: Vec<String> = Vec::new();

    if opt.include_column("contig") {
        columns.push(contig_string(record.reference_sequence_name()));
    }

    if opt.include_column("pos") {
        columns.push(variant_start_string(record.variant_start()));
    }

    if opt.include_column("id") {
        let ids: Vec<String> = record
            .ids()
            .as_ref()
            .iter()
            .map(ToString::to_string)
            .collect();
        columns.push(ids.join(","));
    }

    if opt.include_column("ref") {
        columns.push(record.reference_bases().to_owned());
    }

    if opt.include_column("alt") {
        columns.push(record.alternate_bases().as_ref().join(","));
    }

    if opt.include_column("qual") {
        let quality = match record.quality_score() {
            Some(score) => score.to_string(),
            None => opt.missing_string.to_owned(),
        };
        columns.push(quality);
    }

    if opt.include_column("filter") {
        let filter_set = record.filters();
        let rendered = if filter_set.is_pass() {
            String::from("PASS")
        } else {
            let names: Vec<String> = filter_set
                .as_ref()
                .iter()
                .map(ToString::to_string)
                .collect();
            names.join(",")
        };
        columns.push(rendered);
    }

    columns
}

// Unit-test module, compiled only for test builds; it currently contains no
// tests.
#[cfg(test)]
mod tests {}