Skip to main content

psect_core/
regression.rs

1use std::{cmp::Ordering, collections::HashSet, fmt::Debug, hash::Hash, iter::zip};
2
3use crate::distribution::TestOutcomeDistributions;
4
5pub trait Revision: Debug + Eq + Hash + PartialOrd {}
6
7fn normalize(p_v: &mut Vec<f64>) {
8    match p_v.iter().sum() {
9        0.0 => {}
10        sum => {
11            for p in p_v {
12                *p /= sum;
13            }
14        }
15    }
16}
17
18fn probability_of_outcome<T: Copy>(
19    sample: T,
20    distributions: &TestOutcomeDistributions<T>,
21    p_sample_before_regression: f64,
22) -> f64 {
23    return p_sample_before_regression * distributions.old.p(sample)
24        + (1.0 - p_sample_before_regression) * distributions.new.p(sample);
25}
26
27pub struct RegressionProbabilities<'a, R: Revision> {
28    revisions: &'a Vec<R>,
29
30    /// Probability that revision k is the first to follow the new distribution.
31    ps: Vec<f64>,
32}
33
34impl<'a, R: Revision> Clone for RegressionProbabilities<'a, R> {
35    fn clone(&self) -> Self {
36        RegressionProbabilities {
37            revisions: self.revisions,
38            ps: self.ps.clone(),
39        }
40    }
41}
42
43impl<'a, R: Revision> Debug for RegressionProbabilities<'a, R> {
44    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45        let mut debug_struct = f.debug_struct("RegressionProbabilities");
46        debug_struct.field("entropy", &format_args!("{:.3}", self.entropy()));
47        for (revision, p) in zip(self.revisions, &self.ps) {
48            debug_struct.field(&format!("{:.7?}", revision), &format_args!("{:.3?}", p));
49        }
50        debug_struct.finish()
51    }
52}
53
54impl<'a, R: Revision> RegressionProbabilities<'a, R> {
55    pub fn initialize(
56        revisions: &'a Vec<R>,
57        known_old: &HashSet<R>,
58    ) -> RegressionProbabilities<'a, R> {
59        let num_revisions = revisions.len();
60        let num_known_old_revisions = known_old.len();
61
62        // Known-old revisions cannot be the source of the regression, by assumption.
63        let num_possible_regression_revisions = num_revisions - num_known_old_revisions;
64        let initial_probability = 1.0 / num_possible_regression_revisions as f64;
65
66        let ps: Vec<f64> = revisions
67            .iter()
68            .map(|r| {
69                if known_old.contains(r) {
70                    0.0
71                } else {
72                    initial_probability
73                }
74            })
75            .collect();
76
77        RegressionProbabilities { revisions, ps }
78    }
79
80    pub fn update_with_sample(
81        &mut self,
82        distributions: &TestOutcomeDistributions<bool>,
83        sample_revision: usize,
84        sample_outcome: bool,
85    ) {
86        for (curr_revision, curr_p_regression) in self.ps.iter_mut().enumerate() {
87            let p_of_sample_for_revision = match sample_revision.partial_cmp(&curr_revision) {
88                Some(Ordering::Less) => distributions.old.p(sample_outcome),
89                Some(Ordering::Equal) | Some(Ordering::Greater) => {
90                    distributions.new.p(sample_outcome)
91                }
92                None => panic!("DAGs are not yet supported, revisions must be totally ordered"),
93            };
94            let prior = *curr_p_regression;
95            *curr_p_regression = p_of_sample_for_revision * prior;
96        }
97        normalize(&mut self.ps);
98    }
99
100    fn entropy(&self) -> f64 {
101        self.ps
102            .iter()
103            .filter(|p| **p != 0.0)
104            .map(|p| p * -p.log2())
105            .sum()
106    }
107
108    pub fn confidence(&self) -> f64 {
109        *self.ps.iter().max_by(|a, b| a.total_cmp(b)).unwrap()
110    }
111
112    pub fn most_likely_regression_revision(&self) -> &R {
113        zip(self.revisions, &self.ps)
114            .max_by(|(_, a), (_, b)| a.total_cmp(b))
115            .map(|(revision, _)| revision)
116            .unwrap()
117    }
118}
119
120fn estimate_entropy_after_testing<R: Revision>(
121    regression_ps: &RegressionProbabilities<R>,
122    distributions: &TestOutcomeDistributions<bool>,
123    sample_revision: usize,
124) -> f64 {
125    let p_sample_before_regression: f64 = regression_ps.ps[sample_revision + 1..].iter().sum();
126
127    log::debug!(
128        "Hypothesis: if we were to sample revision {} ({:.1}% likely pre-regression)...",
129        sample_revision,
130        p_sample_before_regression * 100.0
131    );
132
133    let possible_sample_outcomes = vec![false, true];
134    struct ResultPerOutcome {
135        likelihood: f64,
136        new_entropy: f64,
137    }
138
139    let expected_entropy: f64 = possible_sample_outcomes
140        .iter()
141        .map(|&sample_outcome| {
142            let p_sample_outcome =
143                probability_of_outcome(sample_outcome, distributions, p_sample_before_regression);
144
145            let mut new_regression_ps = regression_ps.clone();
146            new_regression_ps.update_with_sample(distributions, sample_revision, sample_outcome);
147
148            let entropy = new_regression_ps.entropy();
149            log::debug!(
150                "    {:.1}% chance it will test {}, resulting in {:?}",
151                p_sample_outcome * 100.0,
152                sample_outcome,
153                new_regression_ps
154            );
155
156            ResultPerOutcome {
157                likelihood: p_sample_outcome,
158                new_entropy: entropy,
159            }
160        })
161        .filter(|result| !result.new_entropy.is_nan())
162        .map(|result| result.likelihood * result.new_entropy)
163        .sum();
164
165    log::debug!(
166        "...we'd expect an information gain of {:.3} shannons.",
167        regression_ps.entropy() - expected_entropy
168    );
169
170    expected_entropy
171}
172
173pub fn next_revision_to_test<'a, R: Revision>(
174    regression_ps: &'a RegressionProbabilities<R>,
175    distributions: &'a TestOutcomeDistributions<bool>,
176) -> &'a R {
177    let num_revisions = regression_ps.ps.len();
178    let mut best_sample_revision = 0;
179    let mut lowest_expected_entropy = f64::INFINITY;
180    for sample_revision in 0..num_revisions {
181        let expected_entropy =
182            estimate_entropy_after_testing(regression_ps, distributions, sample_revision);
183        if expected_entropy < lowest_expected_entropy {
184            lowest_expected_entropy = expected_entropy;
185            best_sample_revision = sample_revision;
186        }
187    }
188    &regression_ps.revisions[best_sample_revision]
189}