hmeasure/
hm.rs

1/*!
2The hmeasure module provides an implementation of:
3CostRatioDensity: a struct to enable generating cost distributions using the Beta distribution.
4HMeasure: a struct to enable generating the H-Measure metric, given a cost distribution and binary classification scores
5for a pair of classes (class0 and class1). Priors for the class scores may be optionally specified, or inferred from
6the data provided.
7The scores are generated exogenously by a model (the hmeasure module is agnostic to the source of the scores).
8HMeasureResults: a struct to receive the results of the HMeasure calculation and enable saving to file.
9*/
10use std::io::Write;
11use std::fs::{OpenOptions, File};
12
13use statrs::distribution::{Beta, Continuous, ContinuousCDF};
14use array2d::Array2D;
15use quadrature::double_exponential;
16
17use crate::datagen;
18use datagen::{BetaParams, BinaryClassScores};
19
20/**
21CostRatioDensity: a struct to enable generating cost distributions using the Beta distribution
22*/
23#[derive(Debug)]
24pub struct CostRatioDensity{
25    pub c_density_obj: Beta
26}
27
28impl CostRatioDensity {
29    pub fn new(bparms: BetaParams) -> CostRatioDensity {
30        CostRatioDensity {
31            c_density_obj: Beta::new(bparms.alpha, bparms.beta).unwrap()
32        }
33    }
34    /**
35    A convenience helper function for a component of the H-Measure numerical integration.
36    */
37    pub fn uc(&self, cost: f64) -> f64 {
38        cost*self.c_density_obj.pdf(cost)
39    }
40    /**
41    A convenience helper function for a component of the H-Measure numerical integration.
42    */
43    pub fn u1mc(&self, cost: f64) -> f64 {
44        (1.0-cost)*self.c_density_obj.pdf(cost)
45    }
46    /**
47    The cumulative probability density function of the CostRatio distribution.
48    */
49    pub fn cdf(&self, cost: f64) -> f64 {self.c_density_obj.cdf(cost) }
50    /**
51    The probability density function of the CostRatio distribution.
52    */
53    pub fn pdf(&self, cost: f64) -> f64 {
54        self.c_density_obj.pdf(cost)
55    }
56}
57
58/**
59HMeasure: a struct to enable generating the H-Measure metric, given a cost distribution and binary classification scores
60for a pair of classes (class0 and class1). Priors for the class scores may be optionally specified, or inferred from
61the data provided.
62The scores are generated exogenously by a model (the hmeasure module is agnostic to the source of the scores).
63*/
64#[derive(Debug)]
65pub struct HMeasure{
66    cost_distribution: CostRatioDensity,
67    class0_prior: Option<f64>,
68    class1_prior: Option<f64>,
69    c0_num: Option<usize>,
70    c1_num: Option<usize>,
71    pub h: Option<f64>
72}
73
74/**
75HMeasureResults: a struct to receive the results of the HMeasure calculation and enable saving to file
76*/
77#[derive(Debug)]
78pub struct HMeasureResults{
79    pub h: f64,
80    pub convex_hull: Vec<Vec<f64>>,
81    pub roc_curve: Vec<Vec<f64>>,
82    pub int_components: Vec<Vec<f64>>,
83    pub class0_score: Vec<f64>,
84    pub class1_score: Vec<f64>
85}
86
87/**
88A trait for saving model results to file given a file handle.
89*/
90pub trait SaveData<File> {
91    fn save(&self, f: &mut File);
92}
93
94impl<File: Write> SaveData<File> for Vec<f64>{
95    fn save(&self, f: &mut File){
96        let sep = ",";
97        for (idx, element) in self.iter().enumerate() {
98                f.write_all(idx.to_string().as_bytes()).expect("the unexpected");
99                f.write_all(sep.as_bytes()).expect("the unexpected");
100                f.write_all(element.to_string().as_bytes()).expect("the unexpected");
101                f.write_all(b"\n").expect("the unexpected");
102            }
103    }
104}
105
106impl<File: Write> SaveData<File> for Array2D<f64>{
107    fn save(&self, f: &mut File){
108        let num_rows = self.column_len();
109        let num_cols = self.row_len();
110        let mut i = 0;
111        let sep = ",";
112        while i < num_rows {
113            let mut j = 0;
114            f.write_all(i.to_string().as_bytes()).expect("the unexpected");
115            f.write_all(sep.as_bytes()).expect("the unexpected");
116            while j < num_cols {
117                let element = self[(i,j)];
118                f.write_all(element.to_string().as_bytes()).expect("the unexpected");
119                if j < num_cols-1 {
120                    f.write_all(sep.as_bytes()).expect("the unexpected");
121                }
122                j += 1;
123            }
124            f.write_all(b"\n").expect("the unexpected");
125            i += 1;
126        }
127    }
128}
129
130impl<File: Write> SaveData<File> for Vec<Vec<f64>>{
131    fn save(&self, f: &mut File){
132        let num_rows = self.len();
133        let num_cols = self[0].len();
134        let mut i = 0;
135        let sep = ",";
136        while i < num_rows {
137            let mut j = 0;
138            f.write_all(i.to_string().as_bytes()).expect("the unexpected");
139            f.write_all(sep.as_bytes()).expect("the unexpected");
140            while j < num_cols {
141                let element = self[i][j];
142                f.write_all(element.to_string().as_bytes()).expect("the unexpected");
143                if j < num_cols-1 {
144                    f.write_all(sep.as_bytes()).expect("the unexpected");
145                }
146                j += 1;
147            }
148            f.write_all(b"\n").expect("the unexpected");
149            i += 1;
150        }
151    }
152}
153
154/**
155A generic for saving model results to file for types that have implemented the SaveData trait.
156*/
157pub fn write_vec<T: SaveData<File>>(vec: &T, filepath: &str, header: &str){
158    let f = &mut OpenOptions::new()
159        .append(true)
160        .create(true)
161        .open(filepath)
162        .expect("the unexpected");
163
164    f.write_all(header.as_bytes()).expect("the unexpected");
165    f.write_all(b"\n").expect("the unexpected");
166    vec.save(f)
167}
168
169/**
170Implementation of the save function for persisting H-Measure model results to file.
171*/
172impl HMeasureResults{
173    pub fn save(&self, result_set: &str, filepath: &str){
174        let chull_name = self.result_set_path("ch", filepath, result_set);
175        write_vec(&self.convex_hull, chull_name.as_str(), "idx,chull0,chull1");
176
177        let roc_name = self.result_set_path("rc", filepath, result_set);
178        write_vec(&self.roc_curve, roc_name.as_str(), "idx,roc0,roc1");
179
180        let intcomp_name = self.result_set_path("ic", filepath, result_set);
181        write_vec(&self.int_components, intcomp_name.as_str(), "idx,ic0,ic1,ic2,ic3");
182
183        let score0_name = self.result_set_path("s0", filepath, result_set);
184        write_vec(&self.class0_score, score0_name.as_str(), "idx,score");
185
186        let score1_name = self.result_set_path("s1", filepath, result_set);
187        write_vec(&self.class1_score, score1_name.as_str(), "idx,score");
188    }
189    pub fn result_set_path(&self, set_name: &str, filepath: &str, resultset: &str) -> String{
190        let name = format!("{}/{}_{}.csv", filepath, set_name, resultset);
191        name
192    }
193}
194
195/**
196Implementation of the HMeasure functions for calculating the H-Measure metric.
197*/
198impl HMeasure{
199    pub fn new(cost_distribution: CostRatioDensity, class0_prior: Option<f64>, class1_prior: Option<f64>) -> HMeasure {
200        HMeasure {
201            cost_distribution: cost_distribution,
202            class0_prior: class0_prior,
203            class1_prior: class1_prior,
204            c0_num: None,
205            c1_num: None,
206            h: None
207        }
208    }
209
210    pub fn h_measure(&mut self, scores: &mut BinaryClassScores) -> HMeasureResults {
211        self.c0_num = Some(scores.class0.len());
212        self.c1_num = Some(scores.class1.len());
213        let num_scores = self.c0_num.unwrap_or(0)+self.c1_num.unwrap_or(0);
214        if self.class0_prior.is_none() || self.class1_prior.is_none() {
215            self.class0_prior = Some(self.c0_num.unwrap_or(0) as f64/num_scores as f64);
216            self.class1_prior = Some(self.c1_num.unwrap_or(0) as f64/num_scores as f64);
217        }
218        scores.class0.sort_by(|a, b| a.partial_cmp(b).unwrap());
219        scores.class1.sort_by(|a, b| a.partial_cmp(b).unwrap());
220        let (c_scores, c_classes) = self.merge_scores(&scores.class0, &scores.class1);
221        let roc_curve = self.build_roc(&c_scores, &c_classes);
222        let (convex_hull, int_components) = self.build_chull(&roc_curve);
223        self.h = self._build_h(&int_components);
224
225        let h = self.h.unwrap();
226        let class0_score = scores.class0.clone();
227        let class1_score = scores.class1.clone();
228        HMeasureResults{ h,
229                        convex_hull,
230                        roc_curve,
231                        int_components,
232                        class0_score,
233                        class1_score
234        }
235    }
236
237    fn merge_scores(&self, c0: &Vec<f64>, c1: &Vec<f64>) -> (Vec<f64>, Vec<u8>) {
238        let mut c0_i = 0;
239        let mut c1_i = 0;
240        let mut cm_i = 0;
241        let c0_num= self.c0_num.unwrap_or(0);
242        let c1_num= self.c1_num.unwrap_or(0);
243        let vec_size = c0_num + c1_num;
244        let mut c_scores = vec![0.0;vec_size];
245        let mut c_classes = vec![0;vec_size];
246
247        while c0_i < c0_num || c1_i < c1_num {
248            if c0_i < c0_num && c1_i < c1_num {
249                if c0[c0_i] <= c1[c1_i] {
250                    c_scores[cm_i] = c0[c0_i];
251                    c_classes[cm_i] = 0;
252                    c0_i += 1;
253                    cm_i += 1;
254                } else if c1[c1_i] <= c0[c0_i] {
255                    c_scores[cm_i] = c1[c1_i];
256                    c_classes[cm_i] = 1;
257                    c1_i += 1;
258                    cm_i += 1;
259                }
260            } else if c0_i < c0_num && c1_i >= c1_num {
261                c_scores[cm_i] = c0[c0_i];
262                c_classes[cm_i] = 0;
263                c0_i += 1;
264                cm_i += 1;
265            } else if c1_i < c1_num && c0_i >= c0_num {
266                c_scores[cm_i] = c1[c1_i];
267                c_classes[cm_i] = 1;
268                c1_i += 1;
269                cm_i += 1;
270            }
271        }
272
273        (c_scores, c_classes)
274    }
275
276    fn build_roc(&self, c_scores: &Vec<f64>, c_classes: &Vec<u8>) -> Vec<Vec<f64>> {
277        let num_rows = c_scores.len();
278        let mut roc_points: Vec<Vec<f64>> = vec![];
279        let mut roc_point = [0.0, 0.0];
280        roc_points.push(vec![roc_point[0], roc_point[1]]);
281        let mut i = 0;
282        while i < num_rows {
283            let (duplicate_0, duplicate_1) = HMeasure::duplicate_classes(c_scores, c_classes, i);
284            let duplicate_i = duplicate_0 + duplicate_1;
285            roc_point[0] += duplicate_1 as f64 / self.c1_num.unwrap_or(1) as f64;
286            roc_point[1] += duplicate_0 as f64 / self.c0_num.unwrap_or(1) as f64;
287            roc_points.push(vec![roc_point[0], roc_point[1]]);
288            i += duplicate_i;
289        }
290        roc_points
291    }
292
293    fn duplicate_classes(scores: &Vec<f64>, classes: &Vec<u8>, idx: usize) -> (usize,usize) {
294        let mut dup_class_0 = vec![];
295        let mut dup_class_1 = vec![];
296        let score_len = scores.len();
297        // scores and classes are sorted already here, so only iterate over the range from idx upwards
298        // and break once the scores are greater than scores[idx]
299        for j in idx..score_len{
300            if scores[j] == scores[idx] {
301                if classes[j] == 0 {
302                    dup_class_0.push(classes[j]);
303                }
304                else {
305                    dup_class_1.push(classes[j]);
306                }
307            } else if scores[j] > scores[idx] {
308                break
309            }
310        }
311
312        (dup_class_0.len(), dup_class_1.len())
313    }
314
315    fn build_chull(&self, roc_c: &Vec<Vec<f64>>) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
316        let num_rows = roc_c.len();
317        let mut chull: Vec<Vec<f64>> = vec![];
318        let mut int_components: Vec<Vec<f64>> = vec![];
319        let mut cval_loc_prior = [0.0, 0.0];
320        let mut cval_prior = 0.0;
321        let mut i = 0;
322        while i < num_rows-1 {
323            let cvals = self._cvals(&roc_c, i);
324            let c_argmin = HMeasure::argmin(&cvals);
325            let cval_loc = [roc_c[i+c_argmin+1][0],roc_c[i+c_argmin+1][1]];
326            chull.push(vec![cval_loc[0],cval_loc[1]]);
327            int_components.push(vec![cvals[c_argmin], cval_prior, cval_loc_prior[0], cval_loc_prior[1]]);
328            cval_loc_prior = cval_loc;
329            cval_prior = cvals[c_argmin];
330            i += c_argmin+1;
331        }
332
333        int_components.push(vec![1.0, cval_prior, cval_loc_prior[0], cval_loc_prior[1]]);
334
335        (chull, int_components)
336    }
337
338    fn _cvals(&self, roc_c: &Vec<Vec<f64>>, current_idx: usize) -> Vec<f64> {
339        let mut cvals: Vec<f64> = vec![];
340        let current_point: [f64;2] = [roc_c[current_idx][0],roc_c[current_idx][1]];
341        let mut i_point_idx = current_idx + 1;
342        while i_point_idx < roc_c.len() {
343            let i_point: [f64;2] = [roc_c[i_point_idx][0],roc_c[i_point_idx][1]];
344            cvals.push(self._cval(&current_point, &i_point));
345            i_point_idx += 1;
346        }
347        cvals
348    }
349
350    fn _cval(&self, current_point: &[f64; 2], i_point: &[f64;2]) -> f64 {
351        let numerator = self.class1_prior.unwrap_or(0.5)*(i_point[0]-current_point[0]);
352        let denominator = self.class0_prior.unwrap_or(0.5)*(i_point[1]-current_point[1]) + self.class1_prior.unwrap_or(0.5)*(i_point[0]-current_point[0]);
353        numerator/denominator
354    }
355
356    fn argmin(v: &Vec<f64>) -> usize {
357        let mut min_idx = 0;
358        let mut min_val = v[0];
359        for (i, &val) in v.iter().enumerate() {
360            if val < min_val {
361                min_idx = i;
362                min_val = val;
363            }
364        }
365        min_idx
366    }
367
368    fn _build_h(&self, int_components: &Vec<Vec<f64>>) -> Option<f64> {
369        let l = self._build_l(&int_components).unwrap();
370        let l_max = self._l_max().unwrap();
371        // println!("l : l_max ~ {:?} : {:?}", l, l_max);
372        Some(1.0-l/l_max)
373    }
374
375    fn _build_l(&self, int_components: &Vec<Vec<f64>>) -> Option<f64> {
376        let mut l = 0.0;
377        let mut i = 0;
378        while i < int_components.len() {
379            let int_vals = &int_components[i];
380            l += self._int_l_step(int_vals);
381            i += 1
382        }
383        Some(l)
384    }
385
386    fn _int_l_step(&self, int_vals: &Vec<f64>) -> f64 {
387        let r0i = int_vals[3];
388        let r1i = int_vals[2];
389        let ci = int_vals[1];
390        let cip1 = int_vals[0];
391        let c0_prior = self.class0_prior.unwrap();
392        let c1_prior = self.class1_prior.unwrap();
393        let int_uc = double_exponential::integrate(|x| self.cost_distribution.uc(x), ci, cip1, 1e-9).integral;
394        let int_u1mc = double_exponential::integrate(|x| self.cost_distribution.u1mc(x), ci, cip1, 1e-9).integral;
395
396        let l_step = c0_prior*(1.0-r0i)*int_uc + c1_prior*r1i*int_u1mc;
397
398        return l_step
399    }
400
401    fn _l_max(&self) -> Option<f64> {
402        let c0_prior = self.class0_prior.unwrap();
403        let c1_prior = self.class1_prior.unwrap();
404        let mut l_max = double_exponential::integrate(|x| self.cost_distribution.uc(x), 0.0, c1_prior, 1e-9).integral*c0_prior;
405        l_max += double_exponential::integrate(|x| self.cost_distribution.u1mc(x), c1_prior, 1.0,1e-9).integral*c1_prior;
406        Some(l_max)
407    }
408}
409
410#[cfg(test)]
411mod tests {
412    use super::{CostRatioDensity,HMeasure};
413    use super::datagen::{BetaParams, BinaryClassifierScores, BinaryClassParams};
414    #[test]
415    fn assert_low_hmeasure() {
416        // Choose class0 and class1 to have identical score distributions => the classifier model that generated the
417        // scores is unable to discriminate between class0 and class1 (it is poor model). Demonstrate near-zero
418        // H-Measure in this case
419        let c0_a: f64 = 4.0;
420        let c1_a: f64 = 4.0;
421        let c0_b: f64 = 4.0;
422        let c1_b: f64 = 4.0;
423        let c0_n: usize = 2000;
424        let c1_n: usize = 2000;
425        // seed the random number generator for reproducibility.
426        let mut rng = BinaryClassifierScores::generate_rng(13);
427        // generate dummy score data for the pair of binary classes: class0 and class1
428        let class0_params = BetaParams { alpha: c0_a, beta: c0_b };
429        let class1_params = BetaParams { alpha: c1_a, beta: c1_b };
430        let bcp = &BinaryClassParams { class0: class0_params, class1: class1_params };
431        let mut bcs = BinaryClassifierScores::new(&bcp, c0_n, c1_n, &mut rng);
432
433        // specify a cost ratio density
434        let cost_density_params = BetaParams { alpha: 2.0, beta: 2.0 };
435        let crd = CostRatioDensity::new(cost_density_params);
436
437        // calculate the H-Measure given the cost ratio density and scores
438        let mut hm = HMeasure::new(crd, None, None);
439        let scores = &mut bcs.scores;
440        let hmr = hm.h_measure(scores);
441        // Note : the H-Measure will not in general be identically zero in this test, despite the
442        // distributions of the class0 and class1 scores being identical. This is a natural property of
443        // a random sample-based measure (the sampled values from the two distributions will in general be
444        // different - although for large sample size, the effect should be small).
445        assert!(hmr.h < 0.001);
446    }
447}