coreset 0.1.1

Coreset and (streaming) clustering
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
//!
//! The module implements variants of the Mettu-Plaxton algorithm:
//!  1. Facility Location in sublinear time.   
//!     Badoiu, Czumaj, Indyk, Sohler ICALP 2005
//!     see [Badoiu](https://people.csail.mit.edu/indyk/fl.pdf).  
//!
//!     This algorithm is restricted to unweighted data and builds upon the following paper:     
//!        
//!  2. The online median problem.     
//!     Mettu-Plaxton Siam 2003 [online-median](https://epubs.siam.org/doi/10.1137/S0097539701383443).
//!    
//!    This algorithm accepts weighted data.
//!
//!  The data are of type Vec\<T\> where T can be anything as long as the hnsw crate provides on these vectors.  
//!  (see [hnsw_rs](https://docs.rs/hnsw_rs/0.1.19/hnsw_rs/dist/index.html))
//!
//!  The bmor [Bmor](crate::bmor::Bmor) is really faster but it can be easier to control the number of faciities allocated by
//!  the algorithms implemented in this module. see [construct_centers](self::MettuPlaxton::construct_centers())
//!

use parking_lot::RwLock;
use rayon::prelude::*;

use rand_xoshiro::rand_core::SeedableRng;
use rand_xoshiro::Xoshiro256PlusPlus;

use rand::distr::{Distribution, Uniform}; // we could use also greenwald_khanna

use anndists::dist::*;

use crate::facility::*;
use crate::scale::*;

//==================================================================================

/// Mettu-Plaxton algorithm with Indyk simplification as described in  [indyk](https://people.csail.mit.edu/indyk/fl.pdf).  
/// This algorithm suppose uniform weights on data.  
/// For weihted data see [WeightedMettuPlaxton]
///
pub struct MettuPlaxton<'b, T: Send + Sync, Dist: Distance<T>> {
    //
    nb_data: usize,
    //
    data: &'b [Vec<T>],
    // j is integer value of log data.len()
    j: u32,
    //
    distance: Dist,
    //
    rng: Xoshiro256PlusPlus,
} // end of struct MettuPlaxton

impl<'b, T: Send + Sync + Clone, Dist: Distance<T>> MettuPlaxton<'b, T, Dist> {
    /// initialization for unweighted data
    pub fn new(data: &'b [Vec<T>], distance: Dist) -> Self {
        let nb_data: usize = data.len();
        let j: u32 = data.len().ilog2();
        //
        MettuPlaxton {
            nb_data,
            data,
            j,
            distance,
            rng: Xoshiro256PlusPlus::seed_from_u64(123),
        }
    }

    pub fn get_nb_data(&self) -> usize {
        self.nb_data
    }

    // estimate cardinal around point in a radius of 2^-j.
    // sample K = c * 2^{-j} * n log n points to estimate N = |B(point , 2−j )|
    // return n * N /K
    // running time O(r * n * log(n)).
    // to be called in //
    fn estimate_ball_cardinal(&self, (ip, point): (usize, &Vec<T>), scale: f32) -> (usize, f32)
    where
        Dist: Sync,
    {
        //
        let c = 2.;
        let mut j_tmp = self.j;
        // at beginning nb_sample = c * self.j i.e c * log(nb_data) and double at each iteration
        let mut rng = self.rng.clone();
        let mut iter_num = 0;
        rng.jump();
        let unif = Uniform::<usize>::new(0, self.nb_data).unwrap();
        let r: f32 = loop {
            let r_test = scale / 2_u32.pow(j_tmp) as f32;
            let nb_sample_f: f32 = c * (self.nb_data as f32 * r_test / scale) / self.j as f32;
            let nb_sample: u32 = nb_sample_f.trunc() as u32;
            log::trace!("estimate_ball_cardinal nb_sample : {:?}", nb_sample);
            let mut nb_in = 1; // count center in!
            for _ in 0..nb_sample {
                // sample and compute distance to point ip
                let k = unif.sample(&mut rng);
                let dist = self.distance.eval(point, &self.data[k]);
                if dist <= r_test {
                    nb_in += 1;
                }
            }
            //
            if nb_in >= 2_usize.pow(j_tmp) {
                log::debug!("estimate_ball_cardinal for point {:?} ; nb_iter = {:?}, cardinal : {:?}, radius : {:.3e}", ip, iter_num, nb_in, r_test);
                // an estimator of radius is 1/2^j
                break r_test;
            } else {
                if j_tmp < 1 {
                    log::error!("error in estimate_ball_cardinal, j_tmp becomes negative");
                    std::process::exit(1);
                }
                j_tmp -= 1;
                iter_num += 1;
            }
        };
        (ip, r)
    }

    /// construct centers (facilities) for a given distance and returns allocated facilities (or centers)
    /// The parameter alfa drives the number of facilities created.
    pub fn construct_centers(&self, alfa: f32) -> Facilities<usize, T, Dist>
    where
        Dist: Send + Sync + Clone,
    {
        // get scales
        let q_dist = get_neighborhood_size(1_000_000, self.data, &self.distance);
        let threshold = q_dist.query(0.999).unwrap().1;
        log::info!("dist medi : {:.3e}", threshold);
        //
        let mut facilities: Facilities<usize, T, Dist> =
            Facilities::<usize, T, Dist>::new(self.j as usize, self.distance.clone());
        // estimate radii in //
        let value_to_match = alfa * threshold;
        let mut radii: Vec<(usize, f32)> = (0..self.nb_data)
            .into_par_iter()
            .map(|i| self.estimate_ball_cardinal((i, &self.data[i]), value_to_match))
            .collect();
        log::debug!("estimate_ball_cardinal done");
        // sort radii
        radii.sort_unstable_by(|it1, it2| it1.1.partial_cmp(&it2.1).unwrap());
        assert!(radii.first().unwrap().1 <= radii.last().unwrap().1);
        // facility allocation, loop on data, check for existing facility around each point
        for p in radii.iter() {
            let matched = facilities.match_point(&self.data[p.0], 2. * p.1, &self.distance);
            if !matched {
                // we inset a facility
                let facility = Facility::new(p.0, &self.data[p.0]);
                facilities.insert(facility);
                log::debug!("inserted facility at {:?}, radius : {:.3e}", p.0, p.1);
            }
        }
        // We explicitly dispatch data to facilities as imp algo do not do it
        // let data_unweighted: Vec<&Vec<T>> = self.data.iter().map(|d| d).collect();
        //        facilities.dispatch_data(&data_unweighted, None);
        //
        facilities
    } // end of construct_centers

    pub fn compute_distances(&self, facilities: &Facilities<usize, T, Dist>)
    where
        Dist: Send + Sync,
    {
        //
        facilities.cross_distances();
    } // end of compute_cost
} // end of impl block

//======================================================================================

/// Mettu-Plaxton online median algorithm Siam 2003 [online-median](https://epubs.siam.org/doi/10.1137/S0097539701383443).  
/// This algorithm can handle weighted data but its complexity is O(n²).  
///
/// It is adapted to final processing of Coreset algorithms where number of items to process has been log reduced (bmor module algos   
/// or blackbox as in Chen K., On Coresets Kmedian Clustering Metric Spaces and Applications 2009 Siam J. Computing)
pub struct WeightedMettuPlaxton<'b, T: Send + Sync, Dist: Distance<T>> {
    //
    nb_data: usize,
    //
    data: &'b Vec<Vec<T>>,
    //
    weights: &'b Vec<f32>,
    // j is integer value of log data.len()
    j: u32,
    //
    distance: Dist,
    //
    _rng: Xoshiro256PlusPlus,
} // end of struct WeightedMettuPlaxton

impl<'b, T: Send + Sync + Clone, Dist: Distance<T> + Send + Sync + Clone>
    WeightedMettuPlaxton<'b, T, Dist>
{
    /// initialization for unweighted data
    pub fn new(data: &'b Vec<Vec<T>>, weights: &'b Vec<f32>, distance: Dist) -> Self {
        let nb_data: usize = data.len();
        let j: u32 = data.len().ilog2();
        //
        WeightedMettuPlaxton {
            nb_data,
            data,
            weights,
            j,
            distance,
            _rng: Xoshiro256PlusPlus::seed_from_u64(123),
        }
    }

    pub fn get_nb_data(&self) -> usize {
        self.nb_data
    }

    // compute all cross distances
    fn compute_all_dists(&self) -> Vec<RwLock<Vec<f32>>> {
        log::debug!("in WeightedMettuPlaxton::compute_all_dists");
        //
        let mut dists: Vec<RwLock<Vec<f32>>> = Vec::<RwLock<Vec<f32>>>::with_capacity(self.nb_data);
        //
        for _ in 0..self.nb_data {
            let d: Vec<f32> = (0..self.nb_data).map(|_| -1.).collect();
            dists.push(RwLock::new(d));
        }

        let compute_for_i = |i: usize| {
            let mut dist_i: Vec<f32> = (0..self.nb_data).map(|_| -1.).collect();
            for j in 0..self.nb_data {
                // has symetric benn computed?
                let dist = dists[j].read()[i];
                let dist_i_j = if dist < 0. {
                    self.distance.eval(&self.data[i], &self.data[j])
                } else {
                    dist
                };
                dist_i[j] = dist_i_j;
            } // end of for j
            *dists[i].write() = dist_i;
            1
        };
        //
        let _res: Vec<i32> = (0..self.nb_data)
            .into_par_iter()
            .map(compute_for_i)
            .collect();
        // now we have all dists
        dists
    } // end of compute_all_dists

    fn compute_ball_radius(&self, alfa: f32, dists: &RwLock<Vec<f32>>) -> f32 {
        //
        log::debug!(
            "\n\n WeightedMettuPlaxton compute_ball_radius , coeff value to match {:.3e}",
            alfa
        );
        log::debug!(
            "WeightedMettuPlaxton compute_ball_radius , radii  {:?}",
            dists.read()
        );
        //
        // we sort distances
        let mut indexed_dist: Vec<(usize, f32)> = (0..self.nb_data)
            .zip(dists.read().iter())
            .map(|(i, f)| (i, *f))
            .collect();
        indexed_dist.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        log::debug!("indexed_dist : {:?}", indexed_dist);
        //
        // first component store weight cumul , second component stores weight * dist cumul
        //
        let mut cumulated_values = Vec::<(f32, f32)>::with_capacity(self.get_nb_data());
        cumulated_values.push((
            self.weights[indexed_dist[0].0],
            indexed_dist[0].1 * self.weights[indexed_dist[0].0],
        ));
        for j in 1..self.get_nb_data() {
            let weight = cumulated_values[j - 1].0 + self.weights[indexed_dist[j].0];
            let indexed_weight =
                cumulated_values[j - 1].1 + indexed_dist[j].1 * self.weights[indexed_dist[j].0];
            cumulated_values.push((weight, indexed_weight));
        }
        assert_eq!(cumulated_values.len(), self.get_nb_data());
        log::debug!("cumulated_values :  {:?}", cumulated_values);
        // compute value at ball centered a ball with radius indexed_dist[j].1 (see paper)
        let value_at_j =
            |j: usize| -> f32 { indexed_dist[j].1 * cumulated_values[j].0 - cumulated_values[j].1 };
        //
        let radius_index = self.nb_data.ilog2().max(1);
        let value = alfa * value_at_j(radius_index.try_into().unwrap());
        log::debug!("value to match : {:.2e}", value);
        if log::log_enabled!(log::Level::Debug) {
            let check: Vec<f32> = (0..self.get_nb_data()).map(value_at_j).collect();
            log::debug!("check : {:?}", check);
        }
        //
        // now we must find greatest j such that value_atj(j) <= value
        // check last value
        let upper_value = value_at_j(self.get_nb_data() - 1);
        log::debug!(
            "imp::compute_ball_radius upper_value : {:.3e} value : {:.3e}",
            upper_value,
            value
        );
        let radius: f32;
        if upper_value <= value {
            log::info!(
                "value is too large upper_value : {:.3e} value : {:.3e}",
                upper_value,
                value
            );
            // we can solve for a large r directly
            // radius = value - upper_value;
            std::panic!("not yet implemented");
        } else {
            let mut upper_index = self.get_nb_data() - 1;
            let mut lower_index = 0;
            let mut middle_index = (upper_index + lower_index) / 2;
            let mut value_at_upper = upper_value;
            let mut value_at_lower = 0.;
            while upper_index - lower_index > 1 {
                log::trace!(
                    "lower index: {:?}, upper index : {:?} value lower : {:?}, value upper {:?}",
                    lower_index,
                    upper_index,
                    value_at_lower,
                    value_at_upper
                );
                let value_at_middle = value_at_j(middle_index);
                if value_at_middle > value {
                    upper_index = middle_index;
                    value_at_upper = value_at_middle;
                } else {
                    lower_index = middle_index;
                    value_at_lower = value_at_middle;
                }
                middle_index = (upper_index + lower_index) / 2;
            } // end while
            log::debug!(
                "lower index : {:?}, upper index : {:?}, value lower  : {:?}, value upper : {:?} ",
                lower_index,
                upper_index,
                value_at_lower,
                value_at_upper
            );
            radius = indexed_dist[lower_index].1
                + (value - value_at_lower) / cumulated_values[lower_index].0;
            log::debug!("got radius : {:?}", radius);
            assert!(radius >= indexed_dist[lower_index].1);
            assert!(radius < indexed_dist[upper_index].1);
        }
        //
        if radius > 0. {
            radius
        } else {
            std::panic!("error in compute_ball_radius, radius : {:?}", radius);
        }
    } // end of compute_ball_radius

    //
    // TODO: to made //
    // alfa is a coefficient to modulate number of facilities created. 0.5 seems a good guess.
    // increasing alfa reduce the number of facilities and reducing alfa makes facility creation easier.
    fn compute_balls_at_value(
        &self,
        alfa: f32,
        dists: &[RwLock<Vec<f32>>],
    ) -> Facilities<usize, T, Dist> {
        //
        log::debug!("in WeightedMettuPlaxton::compute_balls_at_value");
        // for each point compute ball around it of given value
        // corresponds to step 1 of algorithm 2.1 paper [online-median](https://epubs.siam.org/doi/10.1137/S0097539701383443)
        let mut radii: Vec<(usize, f32)> = (0..self.nb_data)
            .into_par_iter()
            .map(|i| (i, self.compute_ball_radius(alfa, &dists[i])))
            .collect();
        // sort by increasing radius (step 2 of algo)
        radii.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        // radii[4] corresponds to point of original index radii[4].0 and so distance to its neighbours are given by dists[radii[4].0]
        let mut facilities: Facilities<usize, T, Dist> =
            Facilities::<usize, T, Dist>::new(self.j as usize, self.distance.clone());
        // we can do step 3 and 4 of algo 2.1
        for p in radii.iter() {
            let matched = facilities.match_point(&self.data[p.0], 2. * p.1, &self.distance);
            if !matched {
                // we insert a facility
                let mut facility = Facility::new(p.0, &self.data[p.0]);
                facility.insert(self.weights[p.0] as f64, p.1);
                log::info!(
                    "inserting facility at {:?}, radius : {:.3e}, weight : {:.3e}",
                    p.0,
                    p.1,
                    facility.get_weight()
                );
                facilities.insert(facility);
            }
        }
        //
        facilities
    } // end of compute_balls_at_value

    /// alfa governs the cost of facility creation so the number of facilities we will get.
    /// alfa = 0.5 is a good value.  
    /// To reduce number of facilities produced increase alfa and inversely
    /// reducing alfa increase the number of facilities
    pub fn construct_centers(&self, alfa: f32) -> Facilities<usize, T, Dist> {
        //
        log::info!(
            "in WeightedMettuPlaxton::construct_centers alfa : {:.3e}",
            alfa
        );
        //
        let dists: Vec<RwLock<Vec<f32>>> = self.compute_all_dists();
        //
        let mut facilities = self.compute_balls_at_value(alfa, &dists);
        //
        // We explicitly dispatch data to facilities as imp algo do not do it
        let data_unweighted: Vec<&Vec<T>> = self.data.iter().collect();
        let ids = (0..data_unweighted.len()).collect::<Vec<usize>>();
        facilities.dispatch_data(&data_unweighted, &ids, None);
        //
        facilities
    } // end of construct_centers
} // end of impl WeightedMettuPlaxton

#[cfg(test)]
mod tests {

    use super::*;
    use rand_distr::Normal;
    use rand_xoshiro::Xoshiro256PlusPlus;

    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    // generate data according to 2 gaussian distributions and random uniform weights sampled in intervals.
    fn generate_weighted_data(nbdata: usize) -> (Vec<Vec<f32>>, Vec<f32>) {
        //
        let dim = 50;
        let mut data = Vec::<Vec<f32>>::with_capacity(nbdata);
        let mut weights = Vec::<f32>::with_capacity(nbdata);
        //
        let mut rng = Xoshiro256PlusPlus::seed_from_u64(1454691);
        //
        // distributions
        //
        let n_mean1 = 2.;
        let n_sigma1 = 1.;
        let normal1 = Normal::new(n_mean1, n_sigma1).unwrap();
        let unif1 = Uniform::<f32>::new(0.5, 3.).unwrap();
        //
        let normal2 = Normal::new(n_mean1, n_sigma1).unwrap();
        let unif2 = Uniform::<f32>::new(0.5, 3.).unwrap();
        //
        // sample
        //
        let half = nbdata / 2;
        for _ in 0..half {
            let d_tmp: Vec<f32> = (0..dim).map(|_| normal1.sample(&mut rng)).collect();
            data.push(d_tmp);
            weights.push(unif1.sample(&mut rng));
        }

        for _ in (half + 1)..nbdata {
            let d_tmp: Vec<f32> = (0..dim).map(|_| normal2.sample(&mut rng)).collect();
            data.push(d_tmp);
            weights.push(unif2.sample(&mut rng));
        }
        // log::debug!("data : {:?}", data);
        // log::debug!("weights : {:?}", weights);
        //
        (data, weights)
    } // end of generate_weighted_data

    #[test]
    fn test_weight_mp() {
        log_init_test();
        //
        log::info!("in test_weight_mp");
        //
        let (data, weights) = generate_weighted_data(12);
        let distance = DistL2;
        //
        let wmp = WeightedMettuPlaxton::new(&data, &weights, distance);
        //
        let alfa = 1.;
        let _ = wmp.construct_centers(alfa);
    } // end of test_weight_mp
} // end of mod tests