varlociraptor/grammar/
mod.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap};
2use std::convert::TryFrom;
3use std::fmt;
4use std::fs::File;
5use std::io::Read;
6use std::ops::{Deref, DerefMut};
7use std::path::Path;
8use std::sync::Mutex;
9
10use anyhow::{Context, Result};
11use vec_map::VecMap;
12
13pub(crate) mod formula;
14pub(crate) mod vaftree;
15
16use crate::errors;
17use crate::grammar::formula::FormulaTerminal;
18pub(crate) use crate::grammar::formula::{Formula, VAFRange, VAFSpectrum, VAFUniverse};
19pub(crate) use crate::grammar::vaftree::VAFTree;
20use crate::variants::model::{AlleleFreq, VariantType};
21use itertools::Itertools;
22use serde::{de, Deserializer};
23
24/// Container for arbitrary sample information.
25/// Use `varlociraptor::grammar::Scenario::sample_info()` to create it.
26#[derive(Clone, Debug)]
27pub(crate) struct SampleInfo<T> {
28    inner: Vec<T>,
29}
30
31impl<T> SampleInfo<T> {
32    /// Map to other value type.
33    pub(crate) fn map<U, F: Fn(&T) -> U>(&self, f: F) -> SampleInfo<U> {
34        SampleInfo {
35            inner: self.inner.iter().map(f).collect(),
36        }
37    }
38
39    pub(crate) fn iter_mut(&mut self) -> impl Iterator<Item = &mut T> {
40        self.inner.iter_mut()
41    }
42}
43
44impl<T> SampleInfo<Option<T>> {
45    pub(crate) fn first_not_none(&self) -> Result<&T> {
46        self.iter_not_none()
47            .next()
48            .ok_or_else(|| errors::Error::EmptyObservations.into())
49    }
50
51    pub(crate) fn first_not_none_mut(&mut self) -> Result<&mut T> {
52        self.iter_not_none_mut()
53            .next()
54            .ok_or_else(|| errors::Error::EmptyObservations.into())
55    }
56
57    pub(crate) fn iter_not_none(&self) -> impl Iterator<Item = &T> {
58        self.inner.iter().filter_map(|item| item.as_ref())
59    }
60
61    pub(crate) fn iter_not_none_mut(&mut self) -> impl Iterator<Item = &mut T> {
62        self.inner.iter_mut().filter_map(|item| item.as_mut())
63    }
64}
65
66impl<T> Default for SampleInfo<T> {
67    fn default() -> Self {
68        SampleInfo {
69            inner: Vec::default(),
70        }
71    }
72}
73
74impl<T> Deref for SampleInfo<T> {
75    type Target = Vec<T>;
76
77    fn deref(&self) -> &Self::Target {
78        &self.inner
79    }
80}
81
82impl<T> DerefMut for SampleInfo<T> {
83    fn deref_mut(&mut self) -> &mut Self::Target {
84        &mut self.inner
85    }
86}
87
88impl<T> From<Vec<T>> for SampleInfo<T> {
89    fn from(vec: Vec<T>) -> Self {
90        SampleInfo { inner: vec }
91    }
92}
93
94/// Builder for `SampleInfo`.
95#[derive(new, Debug)]
96pub(crate) struct SampleInfoBuilder<T> {
97    #[new(default)]
98    inner: VecMap<T>,
99    sample_idx: HashMap<String, usize>,
100}
101
102impl<T> SampleInfoBuilder<T> {
103    pub(crate) fn push(mut self, sample_name: &str, value: T) -> Self {
104        let idx = *self
105            .sample_idx
106            .get(sample_name)
107            .expect("unknown sample name, it does not occur in the scenario");
108        self.inner.insert(idx, value);
109
110        self
111    }
112
113    pub(crate) fn build(self) -> SampleInfo<T> {
114        SampleInfo {
115            inner: self.inner.into_iter().map(|(_, v)| v).collect(),
116        }
117    }
118}
119
120#[derive(Derefable, PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Hash, Deserialize)]
121pub(crate) struct ExpressionIdentifier(#[deref] String);
122
123impl std::fmt::Display for ExpressionIdentifier {
124    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
125        write!(f, "{}", self.0)
126    }
127}
128
129#[derive(Deserialize, Getters)]
130#[get = "pub(crate)"]
131#[serde(deny_unknown_fields)]
132pub(crate) struct Scenario {
133    // map of reusable expressions
134    #[serde(default)]
135    expressions: HashMap<ExpressionIdentifier, Formula>,
136    // map of events
137    events: BTreeMap<String, Formula>,
138    // map of samples
139    samples: BTreeMap<String, Sample>,
140    #[serde(skip)]
141    sample_idx: Mutex<Option<HashMap<String, usize>>>,
142    #[serde(default)]
143    species: Option<Species>,
144}
145
146impl Scenario {
147    pub(crate) fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
148        let mut scenario_content = String::new();
149        File::open(path)?.read_to_string(&mut scenario_content)?;
150
151        let mut scenario: Self = serde_yaml::from_str(&scenario_content)?;
152
153        let mut event_expressions = HashMap::new();
154
155        // register all events as expressions
156        for (name, formula) in scenario.events() {
157            let identifier = ExpressionIdentifier(name.clone());
158            if !scenario.expressions().contains_key(&identifier) {
159                event_expressions.insert(identifier, formula.clone());
160            }
161        }
162        let absent_identifier = ExpressionIdentifier("absent".to_owned());
163        if !scenario.expressions.contains_key(&absent_identifier) {
164            event_expressions.insert(absent_identifier, Formula::absent(&scenario));
165        }
166        scenario.expressions.extend(event_expressions);
167        Ok(scenario)
168    }
169
170    pub(crate) fn variant_type_fractions(&self) -> VariantTypeFraction {
171        self.species()
172            .as_ref()
173            .map_or(VariantTypeFraction::default(), |species| {
174                species.variant_type_fractions().clone()
175            })
176    }
177
178    pub(crate) fn sample_info<T>(&self) -> SampleInfoBuilder<T> {
179        let mut sample_idx = self.sample_idx.lock().unwrap();
180        if sample_idx.is_none() {
181            sample_idx.get_or_insert(
182                self.samples()
183                    .keys()
184                    .enumerate()
185                    .map(|(i, s)| (s.to_owned(), i))
186                    .collect(),
187            );
188        }
189        SampleInfoBuilder::new(sample_idx.as_ref().unwrap().clone())
190    }
191
192    pub(crate) fn idx(&self, sample: &str) -> Option<usize> {
193        let mut sample_idx = self.sample_idx.lock().unwrap();
194        if sample_idx.is_none() {
195            sample_idx.get_or_insert(
196                self.samples()
197                    .keys()
198                    .enumerate()
199                    .map(|(i, s)| (s.to_owned(), i))
200                    .collect(),
201            );
202        }
203        sample_idx.as_ref().unwrap().get(sample).copied()
204    }
205
206    pub(crate) fn vaftrees(&self, contig: &str) -> Result<HashMap<String, VAFTree>> {
207        info!("Preprocessing events for contig {}", contig);
208        let trees = self
209            .events()
210            .iter()
211            .map(|(name, formula)| {
212                let normalized = formula
213                    .normalize(self, contig)
214                    .with_context(|| format!("invalid event definition for {}", name))?;
215                info!("    {}: {}", name, normalized);
216                let vaftree = VAFTree::new(&normalized, self, contig)?;
217                Ok((name.to_owned(), vaftree))
218            })
219            .collect();
220        self.validate(contig)?;
221        trees
222    }
223
224    pub(crate) fn validate(&self, contig: &str) -> Result<()> {
225        let names = self
226            .events()
227            .iter()
228            .filter(|(name, _)| *name != "absent")
229            .map(|(name, formula)| {
230                (
231                    // if `formula.normalize(…)` failed above, we won't get to this line,
232                    // so we might as well unwrap.
233                    formula.normalize(self, contig).map(Formula::from).unwrap(),
234                    name,
235                )
236            })
237            .into_group_map();
238        let mut overlapping = vec![];
239        let events: Vec<_> = names.keys().sorted().cloned().collect();
240        for (e1, e2) in events.iter().tuple_combinations() {
241            // skip comparison of event with itself
242            if e1 == e2 {
243                continue;
244            }
245            let terms = [e1, e2]
246                .iter()
247                .filter(|e| !matches!(e.to_terminal(), Some(FormulaTerminal::False)))
248                .map(|&v| v.clone())
249                .collect_vec();
250
251            // skip if any of the operands is a terminal `False`.
252            if terms.len() != 2 {
253                continue;
254            }
255
256            // TODO make sure the disjunction really is canonical, such that trying to check if it's contained in `events` isn't a game of chance
257            let disjunction =
258                Formula::from(Formula::Disjunction { operands: terms }.normalize(self, contig)?);
259            if events.contains(&disjunction) {
260                overlapping.push((
261                    names[e1].clone(),
262                    names[e2].clone(),
263                    names[&disjunction].clone(),
264                ));
265            }
266        }
267        if !overlapping.is_empty() {
268            return Err(crate::errors::Error::OverlappingEvents {
269                expressions: overlapping
270                    .iter()
271                    .map(|(a1, a2, f)| format!("({:?} | {:?}) = {:?}", a1, a2, f))
272                    .join(", "),
273            }
274            .into());
275        } else {
276            Ok(())
277        }
278    }
279}
280
281impl<'a> TryFrom<&'a str> for Scenario {
282    type Error = serde_yaml::Error;
283
284    fn try_from(yaml: &str) -> Result<Self, Self::Error> {
285        serde_yaml::from_str(yaml)
286    }
287}
288
289#[derive(Deserialize)]
290#[serde(untagged)]
291pub(crate) enum PloidyDefinition {
292    Simple(u32),
293    Map(HashMap<String, u32>),
294}
295
296impl PloidyDefinition {
297    pub(crate) fn contig_ploidy(&self, contig: &str) -> Result<u32> {
298        Ok(match self {
299            PloidyDefinition::Simple(ploidy) => *ploidy,
300            PloidyDefinition::Map(map) => {
301                match map.get(contig) {
302                    Some(ploidy) => *ploidy,
303                    None => map.get("all").copied().ok_or_else(|| {
304                        errors::Error::PloidyContigNotFound {
305                            contig: contig.to_owned(),
306                        }
307                    })?,
308                }
309            }
310        })
311    }
312}
313
314#[derive(Deserialize)]
315#[serde(untagged)]
316pub(crate) enum SexPloidyDefinition {
317    Generic(PloidyDefinition),
318    Specific(HashMap<Sex, PloidyDefinition>),
319}
320
321impl SexPloidyDefinition {
322    pub(crate) fn contig_ploidy(&self, sex: Option<Sex>, contig: &str) -> Result<u32> {
323        match (self, sex) {
324            (SexPloidyDefinition::Generic(p), _) => p.contig_ploidy(contig),
325            (SexPloidyDefinition::Specific(p), Some(s)) => p.get(&s).map_or_else(
326                || {
327                    Err(errors::Error::InvalidPriorConfiguration {
328                        msg: format!("ploidy definition for {} not found", s),
329                    }
330                    .into())
331                },
332                |p| p.contig_ploidy(contig),
333            ),
334            (SexPloidyDefinition::Specific(_), None) => {
335                Err(errors::Error::InvalidPriorConfiguration {
336                    msg: "sex specific ploidy definition found but no sex specified in sample"
337                        .to_owned(),
338                }
339                .into())
340            }
341        }
342    }
343}
344
345#[derive(Deserialize, Getters)]
346#[get = "pub(crate)"]
347#[serde(deny_unknown_fields)]
348pub(crate) struct Species {
349    #[serde(default)]
350    heterozygosity: Option<f64>,
351    #[serde(default, rename = "germline-mutation-rate")]
352    germline_mutation_rate: Option<f64>,
353    #[serde(default, rename = "somatic-effective-mutation-rate")]
354    somatic_effective_mutation_rate: Option<f64>,
355    #[serde(default, rename = "variant-fractions")]
356    variant_type_fractions: VariantTypeFraction,
357    #[serde(default)]
358    ploidy: Option<SexPloidyDefinition>,
359    #[serde(default, rename = "genome-size")]
360    #[allow(dead_code)]
361    // genome size is deprecated but we keep allowing it to not break old scenarios
362    genome_size: Option<f64>,
363}
364
365impl Species {
366    pub(crate) fn contig_ploidy(&self, contig: &str, sex: Option<Sex>) -> Result<Option<u32>> {
367        if let Some(ploidy) = &self.ploidy {
368            Ok(Some(ploidy.contig_ploidy(sex, contig)?))
369        } else {
370            Ok(None)
371        }
372    }
373}
374
375fn default_indel_fraction() -> f64 {
376    0.0125
377}
378
379fn default_mnv_fraction() -> f64 {
380    0.001
381}
382
383fn default_sv_fraction() -> f64 {
384    0.01
385}
386
387/// Mutation rate reduciton factors (relative to SNVs) for variant types.
388/// Defaults:
389/// * mnvs: 0.001 (see https://www.nature.com/articles/s41467-019-12438-5)
390/// * indels: 0.0125 (see https://gatk.broadinstitute.org/hc/en-us/articles/360036826431-HaplotypeCaller, reduction in heterozygosity)
391/// * svs: 0.001 (predicted several hundred times less frequent that SNVs: https://doi.org/10.1038/s41588-018-0107-y)
392#[derive(Deserialize, Getters, Clone, Debug)]
393#[get = "pub(crate)"]
394#[serde(deny_unknown_fields)]
395pub(crate) struct VariantTypeFraction {
396    #[serde(default = "default_indel_fraction")]
397    indel: f64,
398    #[serde(default = "default_mnv_fraction")]
399    mnv: f64,
400    #[serde(default = "default_sv_fraction")]
401    sv: f64,
402}
403
404impl VariantTypeFraction {
405    pub(crate) fn get(&self, variant_type: &VariantType) -> f64 {
406        match variant_type {
407            VariantType::Insertion(_) | VariantType::Deletion(_) | VariantType::Replacement => {
408                self.indel
409            }
410            VariantType::Mnv => self.mnv,
411            VariantType::Inversion | VariantType::Breakend | VariantType::Duplication => self.sv,
412            _ => 1.0,
413        }
414    }
415}
416
417impl Default for VariantTypeFraction {
418    fn default() -> Self {
419        VariantTypeFraction {
420            indel: default_indel_fraction(),
421            mnv: default_mnv_fraction(),
422            sv: default_sv_fraction(),
423        }
424    }
425}
426
427fn default_resolution() -> Resolution {
428    Resolution(AlleleFreq(0.01))
429}
430
431#[derive(Derefable, Debug, Clone)]
432pub(crate) struct Resolution(#[deref] AlleleFreq);
433
434impl<'de> de::Deserialize<'de> for Resolution {
435    fn deserialize<D>(deserializer: D) -> Result<Resolution, D::Error>
436    where
437        D: Deserializer<'de>,
438    {
439        deserializer.deserialize_str(ResolutionVisitor)
440    }
441}
442
443struct ResolutionVisitor;
444
445impl<'de> de::Visitor<'de> for ResolutionVisitor {
446    type Value = Resolution;
447
448    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
449        formatter.write_str(
450            "an allele frequency resolution given as floating point value between 0.0 and 1.0 (exclusive, e.g. 0.01, 0.1, etc.)",
451        )
452    }
453
454    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
455    where
456        E: de::Error,
457    {
458        if let Ok(vaf) = v.parse::<f64>() {
459            if vaf > 0.0 && vaf < 1.0 {
460                return Ok(Resolution(AlleleFreq(vaf)));
461            }
462        }
463
464        Err(de::Error::invalid_value(
465            serde::de::Unexpected::Other("VAF resolution"),
466            &self,
467        ))
468    }
469}
470
471#[derive(Deserialize, Getters)]
472#[serde(deny_unknown_fields)]
473pub(crate) struct Sample {
474    /// optional contamination
475    #[get = "pub(crate)"]
476    contamination: Option<Contamination>,
477    /// grid point resolution for integration over continuous allele frequency ranges
478    #[serde(default = "default_resolution")]
479    #[get = "pub(crate)"]
480    resolution: Resolution,
481    /// possible VAFs of given sample
482    #[serde(default)]
483    #[get = "pub(crate)"]
484    universe: Option<UniverseDefinition>,
485    #[serde(default, rename = "somatic-effective-mutation-rate")]
486    somatic_effective_mutation_rate: Option<f64>,
487    #[serde(default, rename = "germline-mutation-rate")]
488    germline_mutation_rate: Option<f64>,
489    #[serde(default)]
490    #[get = "pub(crate)"]
491    ploidy: Option<PloidyDefinition>,
492    #[get = "pub(crate)"]
493    inheritance: Option<Inheritance>,
494    #[serde(default)]
495    sex: Option<Sex>,
496}
497
498impl Sample {
499    pub(crate) fn has_uniform_prior(&self) -> bool {
500        self.universe.is_some()
501    }
502
503    pub(crate) fn contig_universe(
504        &self,
505        contig: &str,
506        species: &Option<Species>,
507    ) -> Result<VAFUniverse> {
508        if let Some(universe) = &self.universe {
509            Ok(match universe {
510                UniverseDefinition::Simple(ref universe) => universe.clone(),
511                UniverseDefinition::Map(ref map) => match map.get(contig) {
512                    Some(universe) => universe.clone(),
513                    None => map
514                        .get("all")
515                        .ok_or_else(|| errors::Error::UniverseContigNotFound {
516                            contig: contig.to_owned(),
517                        })?
518                        .clone(),
519                },
520            })
521        } else {
522            let ploidy_derived_spectrum = |ploidy| -> BTreeSet<AlleleFreq> {
523                (0..=ploidy)
524                    .map(|n_alt| {
525                        if ploidy > 0 {
526                            AlleleFreq(n_alt as f64 / ploidy as f64)
527                        } else {
528                            assert_eq!(n_alt, 0);
529                            AlleleFreq(0.0)
530                        }
531                    })
532                    .collect()
533            };
534            Ok(
535                match (
536                    self.contig_ploidy(contig, species)?,
537                    self.somatic_effective_mutation_rate.is_some(),
538                ) {
539                    (Some(ploidy), false) => {
540                        let mut universe = VAFUniverse::default();
541                        universe.insert(VAFSpectrum::Set(ploidy_derived_spectrum(ploidy)));
542                        universe
543                    }
544                    (Some(ploidy), true) => {
545                        let ploidy_spectrum = ploidy_derived_spectrum(ploidy);
546
547                        let mut universe = VAFUniverse::default();
548
549                        let mut last = ploidy_spectrum.iter().next().unwrap();
550                        for vaf in ploidy_spectrum.iter().skip(1) {
551                            universe.insert(VAFSpectrum::Range(VAFRange::builder()
552                                .inner(*last..*vaf)
553                                .left_exclusive(true)
554                                .right_exclusive(true)
555                                .build()
556                            ));
557                            last = vaf;
558                        }
559                        universe.insert(VAFSpectrum::Set(ploidy_spectrum));
560                        universe
561                    }
562                    (None, true) => {
563                        let mut universe = VAFUniverse::default();
564                        universe.insert(VAFSpectrum::Range(VAFRange::builder()
565                            .inner(AlleleFreq(0.0)..AlleleFreq(1.0))
566                            .left_exclusive(false)
567                            .right_exclusive(false)
568                            .build()
569                        ));
570                        universe
571                    }
572                    (None, false) => return Err(errors::Error::InvalidPriorConfiguration {
573                        msg: "sample needs to define either universe, ploidy or somatic_mutation_rate".to_owned(),
574                    }
575                        .into()),
576                },
577            )
578        }
579    }
580
581    pub(crate) fn contig_ploidy(
582        &self,
583        contig: &str,
584        species: &Option<Species>,
585    ) -> Result<Option<u32>> {
586        if let Some(ploidy) = &self.ploidy {
587            Ok(Some(ploidy.contig_ploidy(contig)?))
588        } else {
589            species.as_ref().map_or(Ok(None), |species| {
590                species.contig_ploidy(contig, self.sex.clone())
591            })
592        }
593    }
594
595    pub(crate) fn germline_mutation_rate(&self, species: &Option<Species>) -> Option<f64> {
596        if let Some(rate) = self.germline_mutation_rate {
597            Some(rate)
598        } else {
599            species
600                .as_ref()
601                .and_then(|species| species.germline_mutation_rate)
602        }
603    }
604
605    pub(crate) fn somatic_effective_mutation_rate(&self, species: &Option<Species>) -> Option<f64> {
606        if let Some(rate) = self.somatic_effective_mutation_rate {
607            Some(rate)
608        } else {
609            species
610                .as_ref()
611                .and_then(|species| species.somatic_effective_mutation_rate)
612        }
613    }
614}
615
616#[derive(Deserialize, Getters)]
617#[get = "pub(crate)"]
618#[serde(deny_unknown_fields)]
619pub(crate) struct Contamination {
620    /// name of contaminating sample
621    by: String,
622    /// fraction of contamination
623    fraction: f64,
624}
625
626#[derive(
627    Display, Debug, Clone, Serialize, Deserialize, IntoStaticStr, VariantNames, PartialEq, Eq, Hash,
628)]
629#[strum(serialize_all = "kebab_case")]
630pub(crate) enum Inheritance {
631    #[serde(rename = "mendelian")]
632    Mendelian { from: (String, String) },
633    #[serde(rename = "clonal")]
634    Clonal { from: String, somatic: bool },
635    #[serde(rename = "subclonal")]
636    Subclonal { from: String },
637}
638
639#[derive(
640    Display, Debug, Clone, Serialize, Deserialize, IntoStaticStr, VariantNames, PartialEq, Eq, Hash,
641)]
642#[serde(untagged, rename_all = "lowercase")]
643pub(crate) enum Sex {
644    Male,
645    Female,
646    Other(String),
647}
648
649#[derive(Deserialize, Debug)]
650#[serde(untagged)]
651pub(crate) enum UniverseDefinition {
652    Map(BTreeMap<String, VAFUniverse>),
653    Simple(VAFUniverse),
654}