1use std::{cmp::Ordering, collections::HashSet, fmt::Debug, hash::Hash, iter::zip};
2
3use crate::distribution::TestOutcomeDistributions;
4
/// Marker trait for revision identifiers: `Debug` for diagnostic output,
/// `Eq + Hash` for `HashSet` membership, and `PartialOrd` presumably for
/// ordering revisions (not exercised in this file — callers may rely on it).
pub trait Revision: Debug + Eq + Hash + PartialOrd {}
6
/// Scales the entries of `p_v` in place so they sum to 1.0.
///
/// A zero-sum slice (e.g. all probabilities already collapsed to 0) is left
/// untouched to avoid dividing by zero and producing NaNs.
fn normalize(p_v: &mut [f64]) {
    // `if` instead of a `match` on a float literal: floating-point patterns
    // are deprecated in Rust and trigger a future-compatibility lint.
    let sum: f64 = p_v.iter().sum();
    if sum != 0.0 {
        for p in p_v {
            *p /= sum;
        }
    }
}
17
18fn probability_of_outcome<T: Copy>(
19 sample: T,
20 distributions: &TestOutcomeDistributions<T>,
21 p_sample_before_regression: f64,
22) -> f64 {
23 return p_sample_before_regression * distributions.old.p(sample)
24 + (1.0 - p_sample_before_regression) * distributions.new.p(sample);
25}
26
/// Posterior distribution over which revision first introduced the regression.
pub struct RegressionProbabilities<'a, R: Revision> {
    // Ordered revision history under bisection, borrowed from the caller.
    // NOTE(review): `&'a [R]` would be the idiomatic type, but changing it
    // means touching `initialize` and every caller in lockstep.
    revisions: &'a Vec<R>,

    // ps[i] = probability that revisions[i] is the first regressed revision.
    // Kept summing to 1.0 via `normalize` after each Bayesian update (or all
    // zeros in the degenerate case).
    ps: Vec<f64>,
}
33
// Manual `Clone` impl: `#[derive(Clone)]` would add an `R: Clone` bound that
// is not needed here — the `revisions` reference is merely copied and only
// the probability vector is actually cloned.
impl<'a, R: Revision> Clone for RegressionProbabilities<'a, R> {
    fn clone(&self) -> Self {
        RegressionProbabilities {
            revisions: self.revisions,
            ps: self.ps.clone(),
        }
    }
}
42
43impl<'a, R: Revision> Debug for RegressionProbabilities<'a, R> {
44 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45 let mut debug_struct = f.debug_struct("RegressionProbabilities");
46 debug_struct.field("entropy", &format_args!("{:.3}", self.entropy()));
47 for (revision, p) in zip(self.revisions, &self.ps) {
48 debug_struct.field(&format!("{:.7?}", revision), &format_args!("{:.3?}", p));
49 }
50 debug_struct.finish()
51 }
52}
53
54impl<'a, R: Revision> RegressionProbabilities<'a, R> {
55 pub fn initialize(
56 revisions: &'a Vec<R>,
57 known_old: &HashSet<R>,
58 ) -> RegressionProbabilities<'a, R> {
59 let num_revisions = revisions.len();
60 let num_known_old_revisions = known_old.len();
61
62 let num_possible_regression_revisions = num_revisions - num_known_old_revisions;
64 let initial_probability = 1.0 / num_possible_regression_revisions as f64;
65
66 let ps: Vec<f64> = revisions
67 .iter()
68 .map(|r| {
69 if known_old.contains(r) {
70 0.0
71 } else {
72 initial_probability
73 }
74 })
75 .collect();
76
77 RegressionProbabilities { revisions, ps }
78 }
79
80 pub fn update_with_sample(
81 &mut self,
82 distributions: &TestOutcomeDistributions<bool>,
83 sample_revision: usize,
84 sample_outcome: bool,
85 ) {
86 for (curr_revision, curr_p_regression) in self.ps.iter_mut().enumerate() {
87 let p_of_sample_for_revision = match sample_revision.partial_cmp(&curr_revision) {
88 Some(Ordering::Less) => distributions.old.p(sample_outcome),
89 Some(Ordering::Equal) | Some(Ordering::Greater) => {
90 distributions.new.p(sample_outcome)
91 }
92 None => panic!("DAGs are not yet supported, revisions must be totally ordered"),
93 };
94 let prior = *curr_p_regression;
95 *curr_p_regression = p_of_sample_for_revision * prior;
96 }
97 normalize(&mut self.ps);
98 }
99
100 fn entropy(&self) -> f64 {
101 self.ps
102 .iter()
103 .filter(|p| **p != 0.0)
104 .map(|p| p * -p.log2())
105 .sum()
106 }
107
108 pub fn confidence(&self) -> f64 {
109 *self.ps.iter().max_by(|a, b| a.total_cmp(b)).unwrap()
110 }
111
112 pub fn most_likely_regression_revision(&self) -> &R {
113 zip(self.revisions, &self.ps)
114 .max_by(|(_, a), (_, b)| a.total_cmp(b))
115 .map(|(revision, _)| revision)
116 .unwrap()
117 }
118}
119
120fn estimate_entropy_after_testing<R: Revision>(
121 regression_ps: &RegressionProbabilities<R>,
122 distributions: &TestOutcomeDistributions<bool>,
123 sample_revision: usize,
124) -> f64 {
125 let p_sample_before_regression: f64 = regression_ps.ps[sample_revision + 1..].iter().sum();
126
127 log::debug!(
128 "Hypothesis: if we were to sample revision {} ({:.1}% likely pre-regression)...",
129 sample_revision,
130 p_sample_before_regression * 100.0
131 );
132
133 let possible_sample_outcomes = vec![false, true];
134 struct ResultPerOutcome {
135 likelihood: f64,
136 new_entropy: f64,
137 }
138
139 let expected_entropy: f64 = possible_sample_outcomes
140 .iter()
141 .map(|&sample_outcome| {
142 let p_sample_outcome =
143 probability_of_outcome(sample_outcome, distributions, p_sample_before_regression);
144
145 let mut new_regression_ps = regression_ps.clone();
146 new_regression_ps.update_with_sample(distributions, sample_revision, sample_outcome);
147
148 let entropy = new_regression_ps.entropy();
149 log::debug!(
150 " {:.1}% chance it will test {}, resulting in {:?}",
151 p_sample_outcome * 100.0,
152 sample_outcome,
153 new_regression_ps
154 );
155
156 ResultPerOutcome {
157 likelihood: p_sample_outcome,
158 new_entropy: entropy,
159 }
160 })
161 .filter(|result| !result.new_entropy.is_nan())
162 .map(|result| result.likelihood * result.new_entropy)
163 .sum();
164
165 log::debug!(
166 "...we'd expect an information gain of {:.3} shannons.",
167 regression_ps.entropy() - expected_entropy
168 );
169
170 expected_entropy
171}
172
173pub fn next_revision_to_test<'a, R: Revision>(
174 regression_ps: &'a RegressionProbabilities<R>,
175 distributions: &'a TestOutcomeDistributions<bool>,
176) -> &'a R {
177 let num_revisions = regression_ps.ps.len();
178 let mut best_sample_revision = 0;
179 let mut lowest_expected_entropy = f64::INFINITY;
180 for sample_revision in 0..num_revisions {
181 let expected_entropy =
182 estimate_entropy_after_testing(regression_ps, distributions, sample_revision);
183 if expected_entropy < lowest_expected_entropy {
184 lowest_expected_entropy = expected_entropy;
185 best_sample_revision = sample_revision;
186 }
187 }
188 ®ression_ps.revisions[best_sample_revision]
189}