varlociraptor/
cli.rs

1// Copyright 2016-2019 Johannes Köster, David Lähnemann.
2// Licensed under the GNU GPLv3 license (https://opensource.org/licenses/GPL-3.0)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6use std::collections::HashMap;
7use std::convert::{From, TryFrom};
8use std::fs::File;
9use std::path::{Path, PathBuf};
10use std::str::FromStr;
11use std::sync::Arc;
12
13use crate::calling;
14use crate::calling::variants::calling::{
15    call_generic, CallWriter, DefaultCandidateFilter, SampleInfos,
16};
17use crate::calling::variants::preprocessing::haplotype_feature_index::HaplotypeFeatureIndex;
18use crate::candidates;
19use crate::candidates::methylation::MethylationMotif;
20use crate::conversion;
21use crate::errors;
22use crate::estimation;
23use crate::estimation::alignment_properties::AlignmentProperties;
24use anyhow::{bail, Context, Result};
25use bio::stats::bayesian::bayes_factors::evidence::KassRaftery;
26use bio::stats::{LogProb, Prob};
27use itertools::Itertools;
28use structopt::StructOpt;
29use strum::IntoEnumIterator;
30//use crate::estimation::sample_variants;
31//use crate::estimation::tumor_mutational_burden;
32use crate::filtration;
33use crate::grammar;
34use crate::reference;
35use crate::testcase;
36use crate::utils::PathMap;
37use crate::variants::evidence::realignment;
38
39use crate::variants::model::prior::CheckablePrior;
40use crate::variants::model::prior::Prior;
41use crate::variants::model::{AlleleFreq, VariantType};
42use crate::variants::sample::{estimate_alignment_properties, MethylationReadtype};
43
44use crate::SimpleEvent;
45
46#[derive(Debug, StructOpt, Serialize, Deserialize, Clone)]
47#[structopt(
48    name = "varlociraptor",
49    about = "Flexible, arbitrary-scenario, uncertainty-aware variant calling \
50    with parameter free filtration via FDR control for small and structural variants.",
51    setting = structopt::clap::AppSettings::ColoredHelp,
52)]
53pub enum Varlociraptor {
54    #[structopt(
55        name = "preprocess",
56        about = "Preprocess variants",
57        setting = structopt::clap::AppSettings::ColoredHelp,
58    )]
59    Preprocess {
60        #[structopt(subcommand)]
61        kind: PreprocessKind,
62    },
63    #[structopt(
64        name = "call",
65        about = "Call variants.",
66        setting = structopt::clap::AppSettings::ColoredHelp,
67    )]
68    Call {
69        #[structopt(subcommand)]
70        kind: CallKind,
71    },
72    #[structopt(
73        name = "filter-calls",
74        about = "Filter calls by either controlling the false discovery rate (FDR) at given level, or by posterior odds against the given events.",
75        setting = structopt::clap::AppSettings::ColoredHelp,
76    )]
77    FilterCalls {
78        #[structopt(subcommand)]
79        method: FilterMethod,
80    },
81    #[structopt(
82        name = "decode-phred",
83        about = "Decode PHRED-scaled values to human readable probabilities.",
84        usage = "varlociraptor decode-phred < in.bcf > out.bcf",
85        setting = structopt::clap::AppSettings::ColoredHelp,
86    )]
87    DecodePHRED,
88    #[structopt(
89        name = "estimate",
90        about = "Perform estimations.",
91        setting = structopt::clap::AppSettings::ColoredHelp,
92    )]
93    Estimate {
94        #[structopt(subcommand)]
95        kind: EstimateKind,
96    },
97    #[structopt(
98        name = "plot",
99        about = "Create plots",
100        setting = structopt::clap::AppSettings::ColoredHelp,
101    )]
102    Plot {
103        #[structopt(subcommand)]
104        kind: PlotKind,
105    },
106    #[structopt(
107        name = "genotype",
108        about = "Infer classical genotypes from Varlociraptor's AF field (1.0: 1/1, 0.5: 0/1, 0.0: 0/0, otherwise: 0/1). This assumes diploid samples.",
109        usage = "varlociraptor genotype < in.bcf > out.bcf",
110        setting = structopt::clap::AppSettings::ColoredHelp,
111    )]
112    Genotype,
113    #[structopt(
114        name = "methylation-candidates",
115        about = "Generate BCF with methylation candidates",
116        usage = "varlociraptor methylation-candidates input.fasta output.bcf",
117        setting = structopt::clap::AppSettings::ColoredHelp,
118    )]
119    MethylationCandidates {
120        #[structopt(
121            name = "input",
122            parse(from_os_str),
123            required = true,
124            help = "Input FASTA File"
125        )]
126        input: PathBuf,
127        #[structopt(
128            long = "motifs",
129            help = "Comma-separated list of methylation motifs to search for in the input chromosome. Supported motifs: CG, CHG, CHH, GATC.",
130            required = false,
131            use_delimiter = true
132        )]
133        #[serde(default = "default_methylation_motifs")]
134        motifs: Vec<MethylationMotif>,
135        #[structopt(name = "output", parse(from_os_str), help = "Output BCF File")]
136        output: Option<PathBuf>,
137    },
138}
139
140pub struct PreprocessInput {
141    reference: PathBuf,
142    bam: PathBuf,
143}
144
145impl Varlociraptor {
146    fn preprocess_input(&self) -> PreprocessInput {
147        if let Varlociraptor::Preprocess {
148            kind:
149                PreprocessKind::Variants {
150                    ref reference,
151                    ref bam,
152                    ..
153                },
154        } = &self
155        {
156            PreprocessInput {
157                reference: reference.to_owned(),
158                bam: bam.to_owned(),
159            }
160        } else {
161            panic!("bug: these are not preprocess options.");
162        }
163    }
164}
165
166impl FromStr for MethylationMotif {
167    type Err = String;
168
169    fn from_str(s: &str) -> Result<Self, Self::Err> {
170        match s.to_uppercase().as_str() {
171            "CG" => Ok(MethylationMotif::CG),
172            "CHG" => Ok(MethylationMotif::CHG),
173            "CHH" => Ok(MethylationMotif::CHH),
174            "GATC" => Ok(MethylationMotif::GATC),
175            _ => Err(format!("Invalid methylation motif: {}", s)),
176        }
177    }
178}
179
180fn default_methylation_motifs() -> Vec<MethylationMotif> {
181    vec![MethylationMotif::CG]
182}
183
184fn default_reference_buffer_size() -> usize {
185    10
186}
187
188fn default_pairhmm_mode() -> String {
189    "exact".to_owned()
190}
191
192fn default_log_mode() -> String {
193    "default".to_owned()
194}
195
196fn default_min_bam_refetch_distance() -> u64 {
197    1
198}
199
200#[derive(Debug, StructOpt, Serialize, Deserialize, Clone)]
201pub enum PreprocessKind {
202    #[structopt(
203        name = "variants",
204        about = "Preprocess given variants by obtaining internal observations (allele likelihoods, strand information, ...)\
205                 for each fragment. \
206                 The obtained observations are printed to STDOUT in BCF format. Note that the resulting BCFs \
207                 will be very large and are only intended for internal use (e.g. for piping into 'varlociraptor \
208                 call variants generic').",
209        usage = "varlociraptor preprocess variants reference.fasta --alignment-properties alignment-properties.json \
210                 --candidates candidates.bcf --bam sample.bam > sample.observations.bcf",
211        setting = structopt::clap::AppSettings::ColoredHelp,
212    )]
213    Variants {
214        #[structopt(
215            parse(from_os_str),
216            help = "FASTA file with reference genome. Has to be indexed with (e.g. with samtools faidx)."
217        )]
218        reference: PathBuf,
219        #[structopt(
220            parse(from_os_str),
221            long,
222            required = true,
223            help = "sorted VCF/BCF file to process (if omitted, read from STDIN)."
224        )]
225        candidates: PathBuf,
226        #[structopt(
227            long,
228            required = true,
229            help = "BAM file with aligned reads from a single sample. BAM file must be indexed (e.g. with samtools index)."
230        )]
231        bam: PathBuf,
232        #[structopt(
233            long,
234            help = "Name of INFO field containing an expected heterozygosity per variant allele (can be e.g. a population allele frequency). Field needs to have as many entries as ALT alleles per VCF/BCF record."
235        )]
236        variant_heterozygosity_field: Option<String>,
237        #[structopt(
238            long,
239            help = "Name of INFO field containing an expected somatic effective mutation rate per variant allele (can be e.g. a poplulation scale prevalence of a known cancer variant from a database). Field needs to have as many entries as ALT alleles per VCF/BCF record."
240        )]
241        variant_somatic_effective_mutation_rate_field: Option<String>,
242        #[structopt(
243            long,
244            help = "Report fragment IDs in output BCF. This information can be used for phasing."
245        )]
246        #[serde(default)]
247        report_fragment_ids: bool,
248        #[structopt(
249            long,
250            help = "Assume that candidate variants are given in atomic form (unlike e.g. \
251            provided by freebayes which combines adjacent variants that appear to be in \
252            phase into longer haplotypes). If variants are atomic, we do not want to perform \
253            realignment against SNVs and MNVs because those can lead to false positives if \
254            they are immediately followed or preceeded by indels."
255        )]
256        #[serde(default)]
257        atomic_candidate_variants: bool,
258        #[structopt(
259            long,
260            help = "Do not adjust mapping quality (MAPQ). By default Varlociraptor will adjust mapping qualities \
261            in order to avoid false positive hits caused by inflated MAPQ values at ambiguous loci. \
262            This happens by conservatively averaging MAPQs of all reads that overlap a given locus. \
263            While this is usually a good idea and has been validated by extensive benchmarking, there can be cases \
264            where this is not desired, e.g. when solely evaluating known variants."
265        )]
266        #[serde(default)]
267        omit_mapq_adjustment: bool,
268        #[structopt(
269            long = "reference-buffer-size",
270            short = "b",
271            default_value = "10",
272            help = "Number of reference sequences to keep in buffer. Use a smaller value \
273                    to save memory at the expense of sometimes reduced parallelization."
274        )]
275        #[serde(default = "default_reference_buffer_size")]
276        reference_buffer_size: usize,
277        #[structopt(
278            long = "min-bam-refetch-distance",
279            default_value = "1",
280            help = "Base pair distance to last fetched BAM interval such that a refetching is performed \
281                  instead of reading through until the next interval is reached. Making this too small \
282                  can cause unnecessary random access. Making this too large can lead to unneccessary \
283                  iteration over irrelevant records. Benchmarking has shown that at least for short reads, \
284                  a value of 1 (e.g. always refetch) does not incur additional costs and is a reasonable \
285                  default."
286        )]
287        #[serde(default = "default_min_bam_refetch_distance")]
288        min_bam_refetch_distance: u64,
289        #[structopt(
290            long = "alignment-properties",
291            help = "Alignment properties JSON file for sample. If not provided, properties \
292                    will be estimated from the given BAM file. It is recommended to estimate alignment \
293                    properties separately, see 'varlociraptor estimate alignment-properties --help'."
294        )]
295        alignment_properties: Option<PathBuf>,
296        #[structopt(
297            parse(from_os_str),
298            long,
299            help = "BCF file that shall contain the results (if omitted, write to STDOUT)."
300        )]
301        output: Option<PathBuf>,
302        #[structopt(
303            long = "propagate-info-fields",
304            help = "Additional INFO fields in the input BCF that shall be propagated to the output BCF \
305            (not supported for variants connected via EVENT tags except breakends)."
306        )]
307        #[serde(default)]
308        propagate_info_fields: Vec<String>,
309        #[structopt(
310            long = "indel-window",
311            default_value = "64",
312            help = "Number of bases to consider left and right of breakpoint when \
313                    calculating read support. Currently implemented maximum \
314                    value is 64."
315        )]
316        realignment_window: u64,
317        #[structopt(
318            long = "max-depth",
319            default_value = "200",
320            help = "Maximum number of observations to use for calling. If locus is exceeding this \
321                    number, downsampling is performed."
322        )]
323        max_depth: usize,
324        #[structopt(
325            long = "omit-insert-size",
326            help = "Do not consider insert size when calculating support for a variant. Use this flag when \
327                    processing amplicon data, where indels do not impact the observed insert size"
328        )]
329        #[serde(default)]
330        omit_insert_size: bool,
331        #[structopt(
332            long = "pairhmm-mode",
333            possible_values = &["fast", "exact", "homopolymer"],
334            default_value = "exact",
335            help = "PairHMM computation mode (either fast, exact or homopolymer). Fast mode means that only the best \
336                    alignment path is considered for probability calculation. In rare cases, this can lead \
337                    to wrong results for single reads. Hence, we advice to not use it when \
338                    discrete allele frequences are of interest (0.5, 1.0). For continuous \
339                    allele frequencies, fast mode should cause almost no deviations from the \
340                    exact results. Also, if per sample allele frequencies are irrelevant (e.g. \
341                    in large cohorts), fast mode can be safely used. \
342                    Note that fast and exact mode are not suitable for ONT data.\
343                    The homopolymer mode should be used for ONT data; it is similar to the exact mode \
344                    but considers homopolymer errors to be different from gaps."
345        )]
346        #[serde(default = "default_pairhmm_mode")]
347        pairhmm_mode: String,
348        #[structopt(
349            long = "log-mode",
350            possible_values = &["default", "each-record"],
351            default_value = "default",
352            help = "Specify how progress should be logged. By default, a record count will be printed. With 'each-record', \
353            Varlociraptor will additionally print contig and position of each processed candidate variant record. This is \
354            useful for debugging."
355        )]
356        #[serde(default = "default_log_mode")]
357        log_mode: String,
358        #[structopt(
359            parse(from_os_str),
360            long = "output-raw-observations",
361            help = "Output raw observations as TSV files at the given prefix. Attention, only use this for debugging when processing \
362            a single variant. Otherwise it will cause a lots of files and a significant performance hit."
363        )]
364        #[serde(default)]
365        output_raw_observations: Option<PathBuf>,
366        #[structopt(
367            long = "methylation-read-type",
368            possible_values = &MethylationReadtype::iter().map(|v| v.into()).collect_vec(),
369            help = "Type of methylation information encoded in the reads. Use 'converted' for reads treated with bisulfite or EMSeq. Use 'annotated' for reads where methylation information is encoded in the MM and ML tags."
370        )]
371        methylation_readtype: Option<MethylationReadtype>,
372    },
373}
374
375#[derive(Debug, StructOpt, Serialize, Deserialize, Clone)]
376pub enum PlotKind {
377    #[structopt(
378        name = "variant-calling-prior",
379        about = "Plot variant calling prior given a scenario. Plot is printed to STDOUT in Vega-lite format.",
380        usage = "varlociraptor plot variant-calling-prior --scenario scenario.yaml > plot.vl.json",
381        setting = structopt::clap::AppSettings::ColoredHelp,
382    )]
383    VariantCallingPrior {
384        #[structopt(
385            parse(from_os_str),
386            long = "scenario",
387            required = true,
388            help = "Variant calling scenario that configures the prior."
389        )]
390        scenario: PathBuf,
391        #[structopt(
392            long = "contig",
393            required = true,
394            help = "Contig to consider for ploidy information."
395        )]
396        contig: String,
397        #[structopt(long = "sample", required = true, help = "Sample to plot.")]
398        sample: String,
399    },
400    #[structopt(
401        name = "scatter",
402        about = "Plot variant allelic fraction scatter plot overlayed with a contour plot between two sample groups",
403        usage = "varlociraptor plot scatter --sample-x sample1 \
404        --sample-y sample2 sample3 < calls.bcf | vg2svg > scatter.svg",
405        setting = structopt::clap::AppSettings::ColoredHelp,
406    )]
407    Scatter {
408        #[structopt(
409            long = "sample-x",
410            help = "Name of the first sample in the given VCF/BCF."
411        )]
412        sample_x: String,
413        #[structopt(
414            long = "sample-y",
415            help = "Name(s) of the alternative sample(s) in the given VCF/BCF. Multiple samples can be given."
416        )]
417        sample_y: Vec<String>,
418    },
419}
420
421#[derive(Debug, StructOpt, Serialize, Deserialize, Clone)]
422pub enum EstimateKind {
423    #[structopt(
424        name = "alignment-properties",
425        about = "Estimate properties like insert size, maximum softclip length, and the PCR homopolymer error model.",
426        usage = "varlociraptor estimate alignment-properties reference.fasta --bams sample.bam > sample.alignment-properties.json",
427        setting = structopt::clap::AppSettings::ColoredHelp,
428    )]
429    AlignmentProperties {
430        #[structopt(
431            parse(from_os_str),
432            help = "FASTA file with reference genome. Has to be indexed with samtools faidx."
433        )]
434        reference: PathBuf,
435
436        #[structopt(
437            long,
438            required = true,
439            help = "BAM files with aligned reads from a single sample or a set of \
440            samples that have been sequenced and prepared in exactly the same way. \
441            Use multiple BAM files in order to increase estimation robustness in case \
442            the samples only have comparably few reads (e.g. in case of panel sequencing)."
443        )]
444        bams: Vec<PathBuf>,
445
446        #[structopt(long, help = "Number of records to sample from the BAM file")]
447        num_records: Option<usize>,
448    },
449    #[structopt(
450        name = "contamination",
451        about = "Estimate contamination between samples (still experimental). Takes two samples, the one that \
452        is presumably contaminated and the contaminating one, and calculates the posterior distribution of \
453        the contamination fraction, which is printed into a TSV table. The most likely fraction comes at the top. \
454        Uses a uniform prior distribution by default. Optionally, e.g. a pathologist's estimate can be provided, \
455        such that the calculated posterior distribution can be sharpened by the model with the given prior \
456        knowledge. For now, it is always advisable to check the visual output generated by --output-plot.",
457        usage = "varlociraptor estimate contamination --sample sample-a.bcf --contaminant sample-b.bcf --output-plot contamination-qc-plot.vl.json > contamination.tsv",
458        setting = structopt::clap::AppSettings::ColoredHelp,
459    )]
460    Contamination {
461        #[structopt(long = "sample", help = "Presumably contaminated sample.")]
462        sample: PathBuf,
463        #[structopt(long = "contaminant", help = "Presumably contaminating sample.")]
464        contaminant: PathBuf,
465        #[structopt(
466            long = "prior-estimate",
467            help = "Prior estimate of the contamination (1-purity), e.g. \
468            obtained by counting tumor cells in a histology sample. Has to be used \
469            together with the number of cells considered for the estimate (--prior-considered-cells)."
470        )]
471        prior_estimate: Option<f64>,
472        #[structopt(
473            long = "prior-considered-cells",
474            help = "Number of cells considered for the prior estimate (see --prior-estimate)."
475        )]
476        prior_considered_cells: Option<u32>,
477        #[structopt(
478            long = "output",
479            help = "Output file; if not specified, output is printed to STDOUT."
480        )]
481        output: Option<PathBuf>,
482        #[structopt(
483            long = "output-plot",
484            help = "Path to store vega-lite plot of contamination/purity."
485        )]
486        output_plot: Option<PathBuf>,
487        #[structopt(
488            long = "output-max-vaf-variants",
489            help = "Path to store TSV of variant positions with maximum VAF in the contaminated sample (for QC)."
490        )]
491        output_max_vaf_variants: Option<PathBuf>,
492    },
493    #[structopt(
494        name = "mutational-burden",
495        about = "Estimate mutational burden. Takes Varlociraptor calls (must be annotated \
496                 with e.g. VEP but using ANN instead of CSQ) from STDIN, prints mutational burden estimate in Vega-lite JSON format to STDOUT. \
497                 It can be converted to an image via vega-lite-cli (see conda package).",
498        usage = "varlociraptor estimate mutational-burden --mode curve --coding-genome-size 3e7 --events SOMATIC_TUMOR \
499                 --sample tumor < calls.bcf | vg2svg > tmb.svg\n    \
500                 varlociraptor estimate mutational-burden --mode table --coding-genome-size 3e7 --events SOMATIC_TUMOR \
501                 --sample tumor < calls.bcf > tmb.tsv",
502        setting = structopt::clap::AppSettings::ColoredHelp,
503    )]
504    MutationalBurden {
505        #[structopt(long = "events", help = "Events to consider (e.g. SOMATIC_TUMOR).")]
506        events: Vec<String>,
507        #[structopt(
508            long = "sample",
509            help = "Name(s) of the sample(s) in the given VCF/BCF. Multiple samples can be given when using the multibar plot mode."
510        )]
511        sample: Vec<String>,
512        #[structopt(
513            long = "coding-genome-size",
514            default_value = "3e7",
515            help = "Size (in bases) of the covered coding genome."
516        )]
517        coding_genome_size: f64,
518        #[structopt(
519            long = "mode",
520            possible_values = &estimation::mutational_burden::Mode::iter().map(|v| v.into()).collect_vec(),
521            help = "How to output to STDOUT (as stratified curve, histogram, or multi-sample barplot, or TSV table)."
522        )]
523        mode: estimation::mutational_burden::Mode,
524        #[structopt(
525            long = "vaf-cutoff",
526            default_value = "0.2",
527            help = "Minimal variant allelic fraction to consider for mutli-sample barplot"
528        )]
529        cutoff: f64,
530    },
531}
532
533#[derive(Debug, StructOpt, Serialize, Deserialize, Clone)]
534pub enum CallKind {
535    #[structopt(
536        name = "variants",
537        about = "Call variants.",
538        setting = structopt::clap::AppSettings::ColoredHelp,
539    )]
540    Variants {
541        #[structopt(subcommand)]
542        mode: VariantCallMode,
543        #[structopt(
544            long = "propagate-info-fields",
545            help = "Additional INFO fields in the input BCF that shall be propagated to the output BCF."
546        )]
547        propagate_info_fields: Vec<String>,
548        #[structopt(
549            long = "omit-strand-bias",
550            help = "Do not consider strand bias when calculating the probability of an artifact.\
551                    Use this flag when processing (panel) sequencing data, where the wet-lab \
552                    methodology leads to strand bias in the coverage of genuine variants."
553        )]
554        #[serde(default)]
555        omit_strand_bias: bool,
556        #[structopt(
557            long = "omit-read-orientation-bias",
558            help = "Do not consider read orientation bias when calculating the probability of an \
559                    artifact."
560        )]
561        #[serde(default)]
562        omit_read_orientation_bias: bool,
563        #[structopt(
564            long = "omit-read-position-bias",
565            help = "Do not consider read position bias when calculating the probability of an \
566                    artifact. Use this flag when processing (panel) sequencing data, where the \
567                    wet-lab methodology leads to stacks of reads starting at the same position."
568        )]
569        #[serde(default)]
570        omit_read_position_bias: bool,
571        #[structopt(
572            long = "omit-softclip-bias",
573            help = "Do not consider softclip bias when calculating the probability of an \
574                    artifact. Use this flag when processing (panel) sequencing data, where the \
575                    wet-lab methodology leads to stacks of reads starting at the same position."
576        )]
577        #[serde(default)]
578        omit_softclip_bias: bool,
579        #[structopt(
580            long = "omit-homopolymer-artifact-detection",
581            help = "Do not perform PCR homopolymer artifact detection when calculating the probability of an \
582                    artifact. If you are sure that your protocol did not use any PCR you should use this flag."
583        )]
584        #[serde(default)]
585        omit_homopolymer_artifact_detection: bool,
586        #[structopt(
587            long = "omit-alt-locus-bias",
588            help = "Do not consider alt locus bias when calculating the probability of an artifact. \
589                   Use this flag when you have e.g. prior knowledge about all candidate variants being not \
590                   caused by mapping artifacts."
591        )]
592        #[serde(default)]
593        omit_alt_locus_bias: bool,
594        #[structopt(
595            long = "full-prior",
596            help = "Compute the full prior distribution for any allele frequency combination. \
597                    This is in contrast to the default behavior, where the prior is only used to \
598                    distinguish between absence and presence of variants \
599                    (in essence we calculate the prior probability for the variant to be not present given all \
600                    provided mutation rates and the heterozygosity). \
601                    The advantage of this default behavior is that observations become in any case more important \
602                    than the prior to distinguish between different events, while the prior still effectively protects \
603                    from false positives."
604        )]
605        #[serde(default)]
606        full_prior: bool,
607        #[structopt(
608            long = "testcase-locus",
609            help = "Create a test case for the given locus. Locus must be given in the form \
610                    CHROM:POS. Alternatively, for single variant VCFs, you can \
611                    specify 'all'."
612        )]
613        testcase_locus: Option<String>,
614        #[structopt(
615            long = "testcase-prefix",
616            help = "Create test case files in the given directory."
617        )]
618        testcase_prefix: Option<String>,
619        #[structopt(
620            long = "testcase-anonymous",
621            help = "Anonymize any identifiers (via uuid4) and the sequences (by randomly permuting the alphabet) in the test case."
622        )]
623        testcase_anonymous: bool,
624        #[structopt(
625            long,
626            short,
627            help = "Output variant calls to given path (in BCF format). If omitted, prints calls to STDOUT."
628        )]
629        output: Option<PathBuf>,
630        #[structopt(
631            long = "log-mode",
632            possible_values = &["default", "each-record"],
633            default_value = "default",
634            help = "Specify how progress should be logged. By default, a record count will be printed. With 'each-record', \
635            Varlociraptor will additionally print contig and position of each processed candidate variant record. This is \
636            useful for debugging."
637        )]
638        #[serde(default = "default_log_mode")]
639        log_mode: String,
640    },
641    // #[structopt(
642    //     name = "cnvs",
643    //     about = "Call CNVs in tumor-normal sample pairs. This is experimental (do not use it yet).",
644    //     setting = structopt::clap::AppSettings::ColoredHelp,
645    // )]
646    // CNVs {
647    //     #[structopt(
648    //         parse(from_os_str),
649    //         long,
650    //         help = "VCF/BCF file (generated by varlociraptor call-tumor-normal) to process \
651    //                 (if omitted, read from STDIN)."
652    //     )]
653    //     calls: Option<PathBuf>,
654    //     #[structopt(
655    //         parse(from_os_str),
656    //         long,
657    //         help = "BCF file that shall contain the results (if omitted, write to STDOUT)."
658    //     )]
659    //     output: Option<PathBuf>,
660    //     #[structopt(long, short = "p", help = "Tumor purity.")]
661    //     purity: f64,
662    //     #[structopt(
663    //         long = "min-bayes-factor",
664    //         default_value = "1.01",
665    //         help = "Minimum bayes factor (> 1.0) between likelihoods of CNV and no CNV to consider. \
666    //                 The higher this value, the fewer candidate CNVs will be investigated. \
667    //                 Note that this can be usually left unchanged, because every CNV is provided \
668    //                 with a posterior probability that can be used for filtering, e.g., via \
669    //                 'varlociraptor filter-calls control-fdr'."
670    //     )]
671    //     min_bayes_factor: f64,
672    //     #[structopt(
673    //         long,
674    //         default_value = "1000",
675    //         help = "Maximum distance between supporting loci in a CNV."
676    //     )]
677    //     max_dist: u32,
678    //     #[structopt(long, short = "t", help = "Number of threads to use.")]
679    //     threads: usize,
680    // },
681}
682
683#[derive(Debug, StructOpt, Serialize, Deserialize, Clone)]
684pub enum VariantCallMode {
685    #[structopt(
686        name = "tumor-normal",
687        about = "Call somatic and germline variants from a tumor-normal sample pair and a VCF/BCF with candidate variants.",
688        usage = "varlociraptor call variants tumor-normal --purity 0.75 --tumor tumor.bcf --normal normal.bcf > calls.bcf",
689        setting = structopt::clap::AppSettings::ColoredHelp,
690    )]
691    TumorNormal {
692        #[structopt(
693            parse(from_os_str),
694            long = "tumor",
695            required = true,
696            help = "BCF file with varlociraptor preprocess results for the tumor sample."
697        )]
698        tumor_observations: PathBuf,
699        #[structopt(
700            parse(from_os_str),
701            long = "normal",
702            required = true,
703            help = "BCF file with varlociraptor preprocess results for the normal sample."
704        )]
705        normal_observations: PathBuf,
706        #[structopt(short, long, default_value = "1.0", help = "Purity of tumor sample.")]
707        purity: f64,
708    },
709    #[structopt(
710        name = "generic",
711        about = "Call variants for a given scenario specified with the varlociraptor calling \
712                 grammar and a VCF/BCF with candidate variants.",
713        usage = "varlociraptor call variants --output calls.bcf generic --scenario scenario.yaml \
714                 --obs relapse=relapse.bcf tumor=tumor.bcf normal=normal.bcf",
715        setting = structopt::clap::AppSettings::ColoredHelp,
716    )]
717    Generic {
718        #[structopt(
719            parse(from_os_str),
720            long,
721            required = true,
722            help = "Scenario defined in the varlociraptor calling grammar."
723        )]
724        scenario: PathBuf,
725        #[structopt(
726            long = "obs",
727            required = true,
728            help = "BCF file with varlociraptor preprocess results for samples defined in the given scenario (given as samplename=path/to/calls.bcf). It is possible to omit a sample here (e.g. model tumor/normal in the scenario, but only call on the tumor sample when there is no normal sequenced). In that case, the resulting probabilities will be accordingly uncertain, because observations of the omitted sample are missing (which is equivalent to having no coverage in the sample)."
729        )]
730        sample_observations: Vec<String>,
731    },
732}
733
734#[derive(Debug, Serialize, Deserialize, Clone, EnumString, Display)]
735#[strum(serialize_all = "kebab_case")]
736pub enum ControlFDRMode {
737    LocalSmart,
738    LocalStrict,
739    GlobalSmart,
740    GlobalStrict,
741}
742
743impl ControlFDRMode {
744    pub fn is_local(&self) -> bool {
745        match self {
746            ControlFDRMode::LocalSmart | ControlFDRMode::LocalStrict => true,
747            ControlFDRMode::GlobalSmart | ControlFDRMode::GlobalStrict => false,
748        }
749    }
750
751    pub fn is_smart(&self) -> bool {
752        match self {
753            ControlFDRMode::LocalSmart | ControlFDRMode::GlobalSmart => true,
754            ControlFDRMode::LocalStrict | ControlFDRMode::GlobalStrict => false,
755        }
756    }
757}
758
759#[derive(Debug, StructOpt, Serialize, Deserialize, Clone)]
760pub enum FilterMethod {
761    #[structopt(
762        name = "control-fdr",
763        about = "Filter variant calls by controlling FDR. Filtered calls are printed to STDOUT.",
764        usage = "varlociraptor filter-calls control-fdr calls.bcf --events SOMATIC_TUMOR --fdr 0.05 \
765                 --var SNV > calls.filtered.bcf",
766        setting = structopt::clap::AppSettings::ColoredHelp,
767    )]
768    ControlFDR {
769        #[structopt(parse(from_os_str), help = "BCF file with varlociraptor calls.")]
770        calls: PathBuf,
771        #[structopt(
772            long = "var",
773            possible_values = &VariantType::iter().map(|v| v.into()).collect_vec(),
774            help = "Variant type to consider. When controlling global FDR (not using --local) this should \
775            be used to control FDR for each type separately. Otherwise, less certain variant types will be \
776            underrepresented."
777        )]
778        vartype: Option<VariantType>,
779        #[structopt(long, help = "FDR to control for.")]
780        fdr: f64,
781        #[structopt(
782            long = "mode",
783            help = "Mode of FDR control (recommended: local-smart). Choose between local or global and strict or smart, combined via a hyphen (e.g. local-smart). \
784            Local means that for each record, the posterior of the selected events has to be at least 1-fdr. Global means that the \
785            posterior FDR of all selected records may not exceed the given fdr threshold. Strict means that the given events \
786            are directly used for above calculations. Smart means that FDR is controlled just by the probability of variants \
787            to be not absent or artifacts, and the given events are used to further filter out those where the summed probability \
788            of the particular events is <50%. Smart mode is recommended, as the strict mode can lead to unexpected behavior. \
789            For example: if want to control for somatic variants, but there is a degree of ambiguity between the somatic and some other event, \
790            strict mode will filter such variants, while smart mode will keep them and you will see the ambiguity in the record. \
791            The former behavior is much more intuitive than loosing such variants entirely."
792        )]
793        mode: ControlFDRMode,
794        #[structopt(
795            long,
796            help = "Whether smart mode shall retain artifact calls \
797            instead of taking the sum of artifact probability and \
798            absent probability for \
799            determining whether a variant shall be filtered. Setting this causes \
800            variants that are marked as artifacts to be kept.
801            "
802        )]
803        smart_retain_artifacts: bool,
804        #[structopt(long, help = "Events to consider.")]
805        events: Vec<String>,
806        #[structopt(long, help = "Minimum indel length to consider.")]
807        minlen: Option<u64>,
808        #[structopt(long, help = "Maximum indel length to consider (exclusive).")]
809        maxlen: Option<u64>,
810    },
811    #[structopt(
812        name = "posterior-odds",
813        about = "Filter variant calls by posterior odds of given events against the rest of events. \
814                 Calls are taken from STDIN, filtered calls are printed to STDOUT.",
815        usage = "varlociraptor filter-calls posterior-odds --events SOMATIC_TUMOR --odds strong < calls.bcf",
816        setting = structopt::clap::AppSettings::ColoredHelp,
817    )]
818    PosteriorOdds {
819        #[structopt(
820            long,
821            possible_values = &KassRaftery::iter().map(|v| v.into()).collect_vec(),
822            help = "Kass-Raftery score to filter against."
823        )]
824        odds: KassRaftery,
825        #[structopt(long, help = "Events to consider.")]
826        events: Vec<String>,
827    },
828}
829
830fn parse_key_values(values: &[String]) -> Option<PathMap> {
831    let mut map = HashMap::new();
832    for value in values {
833        let kv = value.split_terminator('=').collect_vec();
834        if kv.len() == 2 {
835            map.insert(kv[0].to_owned(), PathBuf::from(kv[1]));
836        } else {
837            return None;
838        }
839    }
840    Some(map)
841}
842
843impl Default for Varlociraptor {
844    fn default() -> Self {
845        Varlociraptor::from_iter(vec!["--help"])
846    }
847}
848
849pub fn run(opt: Varlociraptor) -> Result<()> {
850    let opt_clone = opt.clone();
851    match opt {
852        Varlociraptor::Preprocess { kind } => {
853            match kind {
854                PreprocessKind::Variants {
855                    reference,
856                    candidates,
857                    bam,
858                    report_fragment_ids,
859                    atomic_candidate_variants,
860                    omit_mapq_adjustment,
861                    alignment_properties,
862                    output,
863                    propagate_info_fields,
864                    realignment_window,
865                    max_depth,
866                    omit_insert_size,
867                    pairhmm_mode,
868                    reference_buffer_size,
869                    min_bam_refetch_distance,
870                    log_mode,
871                    output_raw_observations,
872                    methylation_readtype,
873                    variant_heterozygosity_field,
874                    variant_somatic_effective_mutation_rate_field,
875                } => {
876                    // TODO: handle testcases
877                    if realignment_window > (128 / 2) {
878                        return Err(
879                            structopt::clap::Error::with_description(
880                                "Command-line option --indel-window requires a value <= 64 with the current implementation.",
881                                structopt::clap::ErrorKind::ValueValidation
882                            ).into()
883                        );
884                    };
885
886                    let variant_heterozygosity_field = variant_heterozygosity_field.map(Vec::from);
887                    let variant_somatic_effective_mutation_rate_field =
888                        variant_somatic_effective_mutation_rate_field.map(Vec::from);
889
890                    let mut reference_buffer = Arc::new(
891                        reference::Buffer::from_path(&reference, reference_buffer_size)
892                            .context("Unable to read genome reference.")?,
893                    );
894
895                    let alignment_properties = est_or_load_alignment_properties(
896                        &alignment_properties,
897                        &bam,
898                        omit_insert_size,
899                        Arc::get_mut(&mut reference_buffer).unwrap(),
900                        Some(crate::estimation::alignment_properties::NUM_FRAGMENTS),
901                    )?;
902
903                    let gap_params = alignment_properties.gap_params.clone();
904
905                    let log_each_record = log_mode == "each-record";
906
907                    let propagate_info_fields = propagate_info_fields
908                        .iter()
909                        .map(|s| s.as_bytes().to_owned())
910                        .collect();
911
912                    match pairhmm_mode.as_ref() {
913                        "homopolymer" => {
914                            let hop_params = alignment_properties.hop_params.clone();
915                            let mut processor =
916                                calling::variants::preprocessing::ObservationProcessor::builder()
917                                    .report_fragment_ids(report_fragment_ids)
918                                    .adjust_prob_mapping(!omit_mapq_adjustment)
919                                    .alignment_properties(alignment_properties)
920                                    .max_depth(max_depth)
921                                    .inbam(bam)
922                                    .min_bam_refetch_distance(min_bam_refetch_distance)
923                                    .reference_buffer(Arc::clone(&reference_buffer))
924                                    .haplotype_feature_index(HaplotypeFeatureIndex::new(
925                                        &candidates,
926                                    )?)
927                                    .inbcf(candidates)
928                                    .aux_info_fields(propagate_info_fields)
929                                    .options(opt_clone)
930                                    .outbcf(output)
931                                    .raw_observation_output(output_raw_observations)
932                                    .log_each_record(log_each_record)
933                                    .realigner(realignment::HomopolyPairHMMRealigner::new(
934                                        reference_buffer,
935                                        gap_params,
936                                        hop_params,
937                                        realignment_window,
938                                    ))
939                                    .atomic_candidate_variants(atomic_candidate_variants)
940                                    .methylation_readtype(methylation_readtype)
941                                    .variant_heterozygosity_field(variant_heterozygosity_field)
942                                    .variant_somatic_effective_mutation_rate_field(
943                                        variant_somatic_effective_mutation_rate_field,
944                                    )
945                                    .build();
946                            processor.process()?;
947                        }
948                        "fast" => {
949                            let mut processor =
950                                calling::variants::preprocessing::ObservationProcessor::builder()
951                                    .report_fragment_ids(report_fragment_ids)
952                                    .adjust_prob_mapping(!omit_mapq_adjustment)
953                                    .alignment_properties(alignment_properties)
954                                    .max_depth(max_depth)
955                                    .inbam(bam)
956                                    .min_bam_refetch_distance(min_bam_refetch_distance)
957                                    .reference_buffer(Arc::clone(&reference_buffer))
958                                    .haplotype_feature_index(HaplotypeFeatureIndex::new(
959                                        &candidates,
960                                    )?)
961                                    .inbcf(candidates)
962                                    .aux_info_fields(propagate_info_fields)
963                                    .options(opt_clone)
964                                    .outbcf(output)
965                                    .raw_observation_output(output_raw_observations)
966                                    .log_each_record(log_each_record)
967                                    .realigner(realignment::PathHMMRealigner::new(
968                                        gap_params,
969                                        realignment_window,
970                                        reference_buffer,
971                                    ))
972                                    .atomic_candidate_variants(atomic_candidate_variants)
973                                    .methylation_readtype(methylation_readtype)
974                                    .variant_heterozygosity_field(variant_heterozygosity_field)
975                                    .variant_somatic_effective_mutation_rate_field(
976                                        variant_somatic_effective_mutation_rate_field,
977                                    )
978                                    .build();
979                            processor.process()?;
980                        }
981                        "exact" => {
982                            if Prob::from(gap_params.prob_deletion_extend_artifact) > Prob(0.3)
983                                || Prob::from(gap_params.prob_insertion_extend_artifact) > Prob(0.3)
984                            {
985                                warn!(
986                                    "Unexpectedly high prob_deletion_extend_artifact or \
987                                    prob_insertion_extend_artifact in alignment properties (>30%). \
988                                    Consider estimating alignment properties with multiple samples \
989                                    in case you have only few reads."
990                                );
991                            }
992                            let mut processor =
993                                calling::variants::preprocessing::ObservationProcessor::builder()
994                                    .report_fragment_ids(report_fragment_ids)
995                                    .adjust_prob_mapping(!omit_mapq_adjustment)
996                                    .alignment_properties(alignment_properties)
997                                    .max_depth(max_depth)
998                                    .inbam(bam)
999                                    .min_bam_refetch_distance(min_bam_refetch_distance)
1000                                    .reference_buffer(Arc::clone(&reference_buffer))
1001                                    .haplotype_feature_index(HaplotypeFeatureIndex::new(
1002                                        &candidates,
1003                                    )?)
1004                                    .inbcf(candidates)
1005                                    .aux_info_fields(propagate_info_fields)
1006                                    .options(opt_clone)
1007                                    .outbcf(output)
1008                                    .raw_observation_output(output_raw_observations)
1009                                    .log_each_record(log_each_record)
1010                                    .realigner(realignment::PairHMMRealigner::new(
1011                                        reference_buffer,
1012                                        gap_params,
1013                                        realignment_window,
1014                                    ))
1015                                    .atomic_candidate_variants(atomic_candidate_variants)
1016                                    .methylation_readtype(methylation_readtype)
1017                                    .variant_heterozygosity_field(variant_heterozygosity_field)
1018                                    .variant_somatic_effective_mutation_rate_field(
1019                                        variant_somatic_effective_mutation_rate_field,
1020                                    )
1021                                    .build();
1022                            processor.process()?;
1023                        }
1024                        _ => panic!("Unknown pairhmm mode '{}'", pairhmm_mode),
1025                    };
1026                }
1027            }
1028        }
1029        Varlociraptor::Call { kind } => {
1030            match kind {
1031                CallKind::Variants {
1032                    mode,
1033                    omit_strand_bias,
1034                    omit_read_orientation_bias,
1035                    omit_read_position_bias,
1036                    omit_softclip_bias,
1037                    omit_homopolymer_artifact_detection,
1038                    omit_alt_locus_bias,
1039                    testcase_locus,
1040                    testcase_prefix,
1041                    testcase_anonymous,
1042                    output,
1043                    log_mode,
1044                    propagate_info_fields,
1045                    full_prior,
1046                } => {
1047                    let testcase_builder = if let Some(testcase_locus) = testcase_locus {
1048                        if let Some(testcase_prefix) = testcase_prefix {
1049                            // TODO obtain sample information from input bcfs?
1050                            Some(
1051                                testcase::builder::TestcaseBuilder::default()
1052                                    .prefix(PathBuf::from(testcase_prefix))
1053                                    .anonymize(testcase_anonymous)
1054                                    .locus(&testcase_locus)?,
1055                            )
1056                        } else {
1057                            return Err(errors::Error::MissingPrefix.into());
1058                        }
1059                    } else {
1060                        None
1061                    };
1062
1063                    let log_each_record = log_mode == "each-record";
1064
1065                    match mode {
1066                        VariantCallMode::Generic {
1067                            scenario,
1068                            sample_observations,
1069                        } => {
1070                            if let Some(sample_observations) =
1071                                parse_key_values(&sample_observations)
1072                            {
1073                                if let Some(mut testcase_builder) = testcase_builder {
1074                                    for (i, (sample_name, obspath)) in
1075                                        sample_observations.iter().enumerate()
1076                                    {
1077                                        let options = calling::variants::preprocessing::read_preprocess_options(obspath)?;
1078                                        let preprocess_input = options.preprocess_input();
1079                                        testcase_builder = testcase_builder.register_sample(
1080                                            sample_name,
1081                                            preprocess_input.bam,
1082                                            options,
1083                                        )?;
1084                                        if i == 0 {
1085                                            testcase_builder =
1086                                                testcase_builder.candidates(obspath.to_owned());
1087                                            testcase_builder = testcase_builder
1088                                                .reference(preprocess_input.reference)?;
1089                                        }
1090                                    }
1091
1092                                    let mut testcase = testcase_builder
1093                                        .scenario(Some(scenario))
1094                                        .mode(testcase::builder::Mode::Generic)
1095                                        .build()
1096                                        .unwrap();
1097                                    info!("Writing testcase.");
1098                                    testcase.write()?;
1099                                    return Ok(());
1100                                }
1101
1102                                let scenario = grammar::Scenario::from_path(scenario)?;
1103
1104                                call_generic(
1105                                    scenario,
1106                                    sample_observations,
1107                                    omit_strand_bias,
1108                                    omit_read_orientation_bias,
1109                                    omit_read_position_bias,
1110                                    omit_softclip_bias,
1111                                    omit_homopolymer_artifact_detection,
1112                                    omit_alt_locus_bias,
1113                                    output,
1114                                    log_each_record,
1115                                    CallWriter::new(),
1116                                    DefaultCandidateFilter::new(),
1117                                    propagate_info_fields,
1118                                    full_prior,
1119                                )?;
1120                            } else {
1121                                return Err(errors::Error::InvalidObservationsSpec.into());
1122                            }
1123                        }
1124                        VariantCallMode::TumorNormal {
1125                            tumor_observations,
1126                            normal_observations,
1127                            purity,
1128                        } => {
1129                            if let Some(testcase_builder) = testcase_builder {
1130                                let tumor_options =
1131                                    calling::variants::preprocessing::read_preprocess_options(
1132                                        &tumor_observations,
1133                                    )?;
1134                                let normal_options =
1135                                    calling::variants::preprocessing::read_preprocess_options(
1136                                        &normal_observations,
1137                                    )?;
1138                                let mut testcase = testcase_builder
1139                                    .candidates(tumor_observations)
1140                                    .reference(tumor_options.preprocess_input().reference)?
1141                                    .register_sample(
1142                                        "tumor",
1143                                        tumor_options.preprocess_input().bam,
1144                                        tumor_options,
1145                                    )?
1146                                    .register_sample(
1147                                        "normal",
1148                                        normal_options.preprocess_input().bam,
1149                                        normal_options,
1150                                    )?
1151                                    .scenario(None)
1152                                    .mode(testcase::builder::Mode::TumorNormal)
1153                                    .purity(Some(purity))
1154                                    .build()
1155                                    .unwrap();
1156
1157                                testcase.write()?;
1158                                return Ok(());
1159                            }
1160
1161                            let scenario = grammar::Scenario::try_from(
1162                                format!(
1163                                    r#"
1164                            samples:
1165                              tumor:
1166                                resolution: 0.01
1167                                contamination:
1168                                  by: normal
1169                                  fraction: {impurity}
1170                                universe: "[0.0,1.0]"
1171                              normal:
1172                                resolution: 0.1
1173                                universe: "[0.0,0.5[ | 0.5 | 1.0"
1174                            events:
1175                              somatic_tumor:  "tumor:]0.0,1.0] & normal:0.0"
1176                              somatic_normal: "tumor:]0.0,1.0] & normal:]0.0,0.5["
1177                              germline_het:   "tumor:]0.0,1.0] & normal:0.5"
1178                              germline_hom:   "tumor:]0.0,1.0] & normal:1.0"
1179                            "#,
1180                                    impurity = 1.0 - purity
1181                                )
1182                                .as_str(),
1183                            )?;
1184
1185                            let mut observations = PathMap::default();
1186                            observations.insert("tumor".to_owned(), tumor_observations);
1187                            observations.insert("normal".to_owned(), normal_observations);
1188
1189                            call_generic(
1190                                scenario,
1191                                observations,
1192                                omit_strand_bias,
1193                                omit_read_orientation_bias,
1194                                omit_read_position_bias,
1195                                omit_softclip_bias,
1196                                omit_homopolymer_artifact_detection,
1197                                omit_alt_locus_bias,
1198                                output,
1199                                log_each_record,
1200                                CallWriter::new(),
1201                                DefaultCandidateFilter::new(),
1202                                propagate_info_fields,
1203                                full_prior,
1204                            )?;
1205                        }
1206                    }
1207                } // CallKind::CNVs {
1208                  //     calls,
1209                  //     output,
1210                  //     min_bayes_factor,
1211                  //     threads,
1212                  //     purity,
1213                  //     max_dist,
1214                  // } => {
1215                  //     rayon::ThreadPoolBuilder::new()
1216                  //         .num_threads(threads)
1217                  //         .build_global()?;
1218
1219                  //     if min_bayes_factor <= 1.0 {
1220                  //         Err(errors::Error::InvalidMinBayesFactor)?
1221                  //     }
1222
1223                  //     let mut caller = calling::cnvs::CallerBuilder::default()
1224                  //         .bcfs(calls.as_ref(), output.as_ref())?
1225                  //         .min_bayes_factor(min_bayes_factor)
1226                  //         .purity(purity)
1227                  //         .max_dist(max_dist)
1228                  //         .build()
1229                  //         .unwrap();
1230                  //     caller.call()?;
1231                  // }
1232            }
1233        }
1234        Varlociraptor::FilterCalls { method } => match method {
1235            FilterMethod::ControlFDR {
1236                calls,
1237                events,
1238                fdr,
1239                mode,
1240                vartype,
1241                minlen,
1242                maxlen,
1243                smart_retain_artifacts,
1244            } => {
1245                let events = events
1246                    .iter()
1247                    .map(|event| SimpleEvent::new(event))
1248                    .collect_vec();
1249                let vartype = match (vartype, minlen, maxlen) {
1250                    (Some(VariantType::Insertion(None)), Some(minlen), Some(maxlen)) => {
1251                        Some(VariantType::Insertion(Some(minlen..maxlen)))
1252                    }
1253                    (Some(VariantType::Deletion(None)), Some(minlen), Some(maxlen)) => {
1254                        Some(VariantType::Deletion(Some(minlen..maxlen)))
1255                    }
1256                    (vartype, _, _) => vartype,
1257                };
1258
1259                let local = mode.is_local();
1260                let smart = mode.is_smart();
1261
1262                filtration::fdr::control_fdr::<&PathBuf, &str>(
1263                    &calls,
1264                    None,
1265                    &events,
1266                    vartype.as_ref(),
1267                    LogProb::from(Prob::checked(fdr)?),
1268                    local,
1269                    smart,
1270                    smart_retain_artifacts,
1271                )?;
1272            }
1273            FilterMethod::PosteriorOdds { ref events, odds } => {
1274                let events = events
1275                    .iter()
1276                    .map(|event| SimpleEvent {
1277                        name: event.to_owned(),
1278                    })
1279                    .collect_vec();
1280
1281                filtration::posterior_odds::filter_by_odds::<_, &PathBuf, &PathBuf>(
1282                    None, None, &events, odds,
1283                )?;
1284            }
1285        },
1286        Varlociraptor::DecodePHRED => {
1287            conversion::decode_phred::decode_phred()?;
1288        }
1289        Varlociraptor::Genotype => {
1290            conversion::genotype::genotype()?;
1291        }
1292        Varlociraptor::Estimate { kind } => match kind {
1293            EstimateKind::Contamination {
1294                sample,
1295                contaminant,
1296                prior_estimate,
1297                prior_considered_cells,
1298                output,
1299                output_plot,
1300                output_max_vaf_variants,
1301            } => {
1302                let prior_estimate = match (prior_estimate, prior_considered_cells) {
1303                    (Some(p), Some(n)) if n > 0 => Some(
1304                        estimation::contamination::PriorEstimate::new(AlleleFreq(p), n),
1305                    ),
1306                    (None, None) => None,
1307                    _ => bail!(errors::Error::InvalidPriorContaminationEstimate),
1308                };
1309                estimation::contamination::estimate_contamination(
1310                    sample,
1311                    contaminant,
1312                    output,
1313                    output_plot,
1314                    output_max_vaf_variants,
1315                    prior_estimate,
1316                )?;
1317            }
1318            EstimateKind::MutationalBurden {
1319                events,
1320                sample,
1321                coding_genome_size,
1322                mode,
1323                cutoff,
1324            } => estimation::mutational_burden::collect_estimates(
1325                &events,
1326                &sample,
1327                coding_genome_size as u64,
1328                mode,
1329                cutoff,
1330            )?,
1331            EstimateKind::AlignmentProperties {
1332                reference,
1333                bams,
1334                num_records,
1335            } => {
1336                let mut reference_buffer = reference::Buffer::from_path(&reference, 1)?;
1337                let alignment_properties = estimate_alignment_properties(
1338                    &bams,
1339                    false,
1340                    &mut reference_buffer,
1341                    num_records,
1342                )?;
1343                println!("{}", serde_json::to_string_pretty(&alignment_properties)?);
1344            }
1345        },
1346        Varlociraptor::Plot { kind } => match kind {
1347            PlotKind::VariantCallingPrior {
1348                scenario,
1349                contig,
1350                sample,
1351            } => {
1352                let scenario = grammar::Scenario::from_path(scenario)?;
1353                let sample_infos = SampleInfos::try_from(&scenario)?;
1354
1355                let mut universes = scenario.sample_info();
1356                let mut ploidies = scenario.sample_info();
1357                for (sample_name, sample) in scenario.samples().iter() {
1358                    universes = universes.push(
1359                        sample_name,
1360                        sample.contig_universe(&contig, scenario.species())?,
1361                    );
1362                    ploidies = ploidies.push(
1363                        sample_name,
1364                        sample.contig_ploidy(&contig, scenario.species())?,
1365                    );
1366                }
1367                let universes = universes.build();
1368                let ploidies = ploidies.build();
1369
1370                let prior = Prior::builder()
1371                    .variant_type_fractions(scenario.variant_type_fractions())
1372                    .ploidies(Some(ploidies))
1373                    .universe(Some(universes))
1374                    .uniform(sample_infos.uniform_prior().clone())
1375                    .germline_mutation_rate(sample_infos.germline_mutation_rates().clone())
1376                    .somatic_effective_mutation_rate(
1377                        sample_infos.somatic_effective_mutation_rates().clone(),
1378                    )
1379                    .inheritance(sample_infos.inheritance().clone())
1380                    .heterozygosity(scenario.species().as_ref().and_then(|species| {
1381                        species.heterozygosity().map(|het| LogProb::from(Prob(het)))
1382                    }))
1383                    .variant_type(Some(VariantType::Snv))
1384                    .is_absent_only(false) // TODO make configurable with this as the default
1385                    .build();
1386                prior.check()?;
1387
1388                prior.plot(&sample, sample_infos.names())?;
1389            }
1390            PlotKind::Scatter { sample_x, sample_y } => {
1391                estimation::sample_variants::vaf_scatter(&sample_x, &sample_y)?
1392            }
1393        },
1394        Varlociraptor::MethylationCandidates {
1395            input,
1396            motifs,
1397            output,
1398        } => {
1399            candidates::methylation::find_candidates(input, motifs, output)?;
1400        }
1401    }
1402    Ok(())
1403}
1404
1405pub(crate) fn est_or_load_alignment_properties(
1406    alignment_properties_file: &Option<impl AsRef<Path>>,
1407    bam_file: impl AsRef<Path>,
1408    omit_insert_size: bool,
1409    reference_buffer: &mut reference::Buffer,
1410    num_records: Option<usize>,
1411) -> Result<AlignmentProperties> {
1412    if let Some(alignment_properties_file) = alignment_properties_file {
1413        Ok(serde_json::from_reader(File::open(
1414            alignment_properties_file,
1415        )?)?)
1416    } else {
1417        estimate_alignment_properties(&[bam_file], omit_insert_size, reference_buffer, num_records)
1418    }
1419}
varlociraptor/cli.rs

varlociraptor/
cli.rs