mecomp_analysis/clustering.rs

//! This module contains helpers that wrap a k-means crate to perform clustering on the data
//! without having to choose an exact number of clusters.
//!
//! Instead, you provide the maximum number of clusters you want to try, and we'll use one of a
//! range of methods to determine the optimal number of clusters.
//!
//! # References:
//!
//! - The gap statistic [R. Tibshirani, G. Walther, and T. Hastie (Stanford University, 2001)](https://hastie.su.domains/Papers/gap.pdf)
//! - The Davies-Bouldin index [wikipedia](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index)

use linfa::prelude::*;
use linfa_clustering::{GaussianMixtureModel, KMeans};
use linfa_nn::distance::{Distance, L2Dist};
use linfa_reduction::Pca;
use linfa_tsne::TSneParams;
use log::{debug, info};
use ndarray::{Array, Array1, Array2, ArrayView1, ArrayView2, Axis, Dim};
use ndarray_rand::RandomExt;
use ndarray_stats::QuantileExt;
use rand::distributions::Uniform;
use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator};
// use statrs::statistics::Statistics;

use crate::{
    DIM_EMBEDDING, Feature, NUMBER_FEATURES,
    errors::{ClusteringError, ProjectionError},
};

pub type FitDataset = Dataset<Feature, (), Dim<[usize; 1]>>;

#[derive(Clone, Copy, Debug)]
#[allow(clippy::module_name_repetitions)]
pub enum ClusteringMethod {
    KMeans,
    GaussianMixtureModel,
}

impl ClusteringMethod {
    /// Fit the clustering method to the dataset and get the labels
    #[must_use]
    fn fit(self, k: usize, data: &FitDataset) -> Array1<usize> {
        match self {
            Self::KMeans => {
                let model = KMeans::params(k)
                    // .max_n_iterations(MAX_ITERATIONS)
                    .fit(data)
                    .unwrap();
                model.predict(data.records())
            }
            Self::GaussianMixtureModel => {
                let model = GaussianMixtureModel::params(k)
                    .init_method(linfa_clustering::GmmInitMethod::KMeans)
                    .n_runs(10)
                    .fit(data)
                    .unwrap();
                model.predict(data.records())
            }
        }
    }
}

#[derive(Clone, Copy, Debug)]
pub enum KOptimal {
    GapStatistic {
        /// The number of reference datasets to generate
        b: u32,
    },
    DaviesBouldin,
}

/// Should the data be projected into a lower-dimensional space before clustering, and if so, how?
#[derive(Clone, Copy, Debug, Default)]
pub enum ProjectionMethod {
    /// Use t-SNE to project the data into a lower-dimensional space
    TSne,
    /// Use PCA to project the data into a lower-dimensional space
    Pca,
    /// Don't project the data
    #[default]
    None,
}

impl ProjectionMethod {
    /// Project the data into a lower-dimensional space
    ///
    /// # Errors
    ///
    /// Will return an error if there was an error projecting the data into a lower-dimensional space
    #[inline]
    pub fn project(self, samples: Array2<Feature>) -> Result<Array2<Feature>, ProjectionError> {
        let result = match self {
            Self::TSne => {
                let nrecords = samples.nrows();
                // first use the t-SNE algorithm to project the data into a lower-dimensional space
                debug!("Generating embeddings (size: {EMBEDDING_SIZE}) using t-SNE");
                #[allow(clippy::cast_precision_loss)]
                let mut embeddings = TSneParams::embedding_size(EMBEDDING_SIZE)
                    .perplexity(Feature::max(samples.nrows() as Feature / 20., 5.))
                    .approx_threshold(0.5)
                    .transform(samples)?;
                debug_assert_eq!(embeddings.shape(), &[nrecords, EMBEDDING_SIZE]);

                // normalize the embeddings so each dimension is between -1 and 1
                debug!("Normalizing embeddings");
                normalize_embeddings_inplace(&mut embeddings);
                embeddings
            }
            Self::Pca => {
                let nrecords = samples.nrows();
                // use the PCA algorithm to project the data into a lower-dimensional space
                debug!("Generating embeddings (size: {EMBEDDING_SIZE}) using PCA");
                // linfa_reduction::pca::PCA only works for f64, see: https://github.com/rust-ml/linfa/issues/232
                let data = Dataset::from(samples.mapv(f64::from));
                let pca: Pca<f64> = Pca::params(EMBEDDING_SIZE).whiten(true).fit(&data)?;
                #[allow(clippy::cast_possible_truncation)]
                let mut embeddings = pca.predict(&data).mapv(|f| f as Feature);
                debug_assert_eq!(embeddings.shape(), &[nrecords, EMBEDDING_SIZE]);

                // normalize the embeddings so each dimension is between -1 and 1
                debug!("Normalizing embeddings");
                normalize_embeddings_inplace(&mut embeddings);
                embeddings
            }
            Self::None => {
                debug!("Using original data as embeddings");
                samples
            }
        };
        debug!("Embeddings shape: {:?}", result.shape());
        Ok(result)
    }
}

// Normalize the embeddings, in-place, so each column lies between -1.0 and 1.0.
// Assumes no column is constant (a zero range would divide by zero).
fn normalize_embeddings_inplace(embeddings: &mut Array2<Feature>) {
    for i in 0..embeddings.ncols() {
        let min = embeddings.column(i).min().copied().unwrap_or_default();
        let max = embeddings.column(i).max().copied().unwrap_or_default();
        let range = max - min;
        embeddings
            .column_mut(i)
            .mapv_inplace(|v| ((v - min) / range).mul_add(2., -1.));
    }
}

/// Dimensionality that the t-SNE and PCA projection methods aim to project the data into:
/// the integer (floor) base-2 log of the feature count (the larger of `DIM_EMBEDDING` and
/// `NUMBER_FEATURES`), clamped to at least 2.
const EMBEDDING_SIZE: usize = {
    let log2 = usize::ilog2(if DIM_EMBEDDING < NUMBER_FEATURES {
        NUMBER_FEATURES
    } else {
        DIM_EMBEDDING
    }) as usize;
    if log2 < 2 { 2 } else { log2 }
};

#[allow(clippy::module_name_repetitions)]
pub struct ClusteringHelper<S>
where
    S: Sized,
{
    state: S,
}

pub struct EntryPoint;
pub struct NotInitialized {
    /// The embeddings of our input, as an N×`EMBEDDING_SIZE` array
    embeddings: Array2<Feature>,
    pub k_max: usize,
    pub optimizer: KOptimal,
    pub clustering_method: ClusteringMethod,
}
pub struct Initialized {
    /// The embeddings of our input, as an N×`EMBEDDING_SIZE` array
    embeddings: Array2<Feature>,
    pub k: usize,
    pub clustering_method: ClusteringMethod,
}
pub struct Finished {
    /// The labelings of the samples, as an N×1 array.
    /// Each element is the cluster that the corresponding sample belongs to.
    labels: Array1<usize>,
    pub k: usize,
}

/// Functions available for the `EntryPoint` state
impl ClusteringHelper<EntryPoint> {
    /// Create a new `ClusteringHelper` object
    ///
    /// # Errors
    ///
    /// Will return an error if there was an error projecting the data into a lower-dimensional space
    #[allow(clippy::missing_inline_in_public_items)]
    pub fn new(
        samples: Array2<Feature>,
        k_max: usize,
        optimizer: KOptimal,
        clustering_method: ClusteringMethod,
        projection_method: ProjectionMethod,
    ) -> Result<ClusteringHelper<NotInitialized>, ClusteringError> {
        if samples.nrows() <= 15 {
            return Err(ClusteringError::SmallLibrary);
        }

        // project the data into a lower-dimensional space
        let embeddings = projection_method.project(samples)?;

        Ok(ClusteringHelper {
            state: NotInitialized {
                embeddings,
                k_max,
                optimizer,
                clustering_method,
            },
        })
    }
}

/// Functions available for the `NotInitialized` state
impl ClusteringHelper<NotInitialized> {
    /// Initialize the `ClusteringHelper` object
    ///
    /// # Errors
    ///
    /// Will return an error if there was an error calculating the optimal number of clusters
    #[inline]
    pub fn initialize(self) -> Result<ClusteringHelper<Initialized>, ClusteringError> {
        let k = self.get_optimal_k()?;
        Ok(ClusteringHelper {
            state: Initialized {
                embeddings: self.state.embeddings,
                k,
                clustering_method: self.state.clustering_method,
            },
        })
    }

    fn get_optimal_k(&self) -> Result<usize, ClusteringError> {
        match self.state.optimizer {
            KOptimal::GapStatistic { b } => self.get_optimal_k_gap_statistic(b),
            KOptimal::DaviesBouldin => self.get_optimal_k_davies_bouldin(),
        }
    }

    /// Get the optimal number of clusters using the gap statistic
    ///
    /// # References:
    ///
    /// - [R. Tibshirani, G. Walther, and T. Hastie (Stanford University, 2001)](https://hastie.su.domains/Papers/gap.pdf)
    ///
    /// # Algorithm:
    ///
    /// 1. Cluster the observed data, varying the number of clusters from k = 1, …, `k_max`, and
    ///    compute the corresponding total within-cluster dispersion `W_k`.
    /// 2. Generate B reference data sets with a random uniform distribution. Cluster each of these
    ///    reference data sets with varying number of clusters k = 1, …, `k_max`, and compute the
    ///    corresponding total within-cluster dispersion `W^*_{kb}`.
    /// 3. Compute the estimated gap statistic as the deviation of the observed `W_k` value from its
    ///    expected value under the null hypothesis:
    ///    `Gap(k) = (1/B) \sum_{b=1}^{B} \log(W^*_{kb}) - \log(W_k)`.
    ///    Compute also the standard deviation of the statistics.
    /// 4. Choose the number of clusters as the smallest value of k such that the gap statistic is
    ///    within one standard deviation of the gap at k+1:
    ///    `Gap(k) ≥ Gap(k+1) - s_{k+1}`.
    fn get_optimal_k_gap_statistic(&self, b: u32) -> Result<usize, ClusteringError> {
        let embedding_dataset = Dataset::from(self.state.embeddings.clone());

        // our reference data sets
        let reference_datasets =
            generate_reference_datasets(embedding_dataset.records().view(), b as usize);

        #[allow(clippy::cast_precision_loss)]
        let b = b as Feature;

        let results = (1..=self.state.k_max)
            // for each k, cluster the data into k clusters
            .map(|k| {
                debug!("Fitting k-means to embeddings with k={k}");
                let labels = self.state.clustering_method.fit(k, &embedding_dataset);
                (k, labels)
            })
            // for each k, calculate the gap statistic and the standard deviation of the statistics
            .map(|(k, labels)| {
                // calculate the within-cluster dispersion for the reference data sets
                debug!("Calculating within-cluster dispersion for reference data sets with k={k}");
                let w_kb_log: Vec<_> = reference_datasets
                    .par_iter()
                    .map(|ref_data| {
                        // cluster the reference data into k clusters
                        let ref_labels = self.state.clustering_method.fit(k, ref_data);
                        // calculate the within-cluster dispersion for the reference data
                        let ref_pairwise_distances = calc_pairwise_distances(
                            ref_data.records().view(),
                            k,
                            ref_labels.view(),
                        );
                        calc_within_dispersion(ref_labels.view(), k, ref_pairwise_distances.view())
                            .log2()
                    })
                    .collect();
                let w_kb_log = Array::from_vec(w_kb_log);

                // calculate the within-cluster dispersion for the observed data
                let pairwise_distances =
                    calc_pairwise_distances(self.state.embeddings.view(), k, labels.view());
                let w_k = calc_within_dispersion(labels.view(), k, pairwise_distances.view());

                // finally, calculate the gap statistic
                let w_kb_log_sum: Feature = w_kb_log.sum();
                // original formula: l = (1 / B) * sum_b(log(W_kb))
                let l = b.recip() * w_kb_log_sum;
                // original formula: gap_k = (1 / B) * sum_b(log(W_kb)) - log(W_k)
                let gap_k = l - w_k.log2();
                // original formula: sd_k = [(1 / B) * sum_b((log(W_kb) - l)^2)]^0.5
                let standard_deviation = (b.recip() * (w_kb_log - l).pow2().sum()).sqrt();
                // original formula: s_k = sd_k * (1 + 1 / B)^0.5
                let s_k = standard_deviation * (1.0 + b.recip()).sqrt();

                (k, gap_k, s_k)
            });

        // // plot the gap_k (whisker with s_k) w.r.t. k
        // #[cfg(feature = "plot_gap")]
        // plot_gap_statistic(results.clone().collect::<Vec<_>>());

        // finally, consume the results iterator until we find the optimal k,
        // i.e. the smallest k such that Gap(k) >= Gap(k+1) - s_{k+1}
        let (mut optimal_k, mut gap_k_minus_one) = (None, None);
        for (k, gap_k, s_k) in results {
            info!("k: {k}, gap_k: {gap_k}, s_k: {s_k}");

            if let Some(gap_k_minus_one) = gap_k_minus_one
                && gap_k_minus_one >= gap_k - s_k
            {
                info!("Optimal k found: {}", k - 1);
                optimal_k = Some(k - 1);
                break;
            }

            gap_k_minus_one = Some(gap_k);
        }

        optimal_k.ok_or(ClusteringError::OptimalKNotFound(self.state.k_max))
    }

    fn get_optimal_k_davies_bouldin(&self) -> Result<usize, ClusteringError> {
        todo!();
    }
}

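// A minimal sketch (not part of the original module) of the Davies-Bouldin index that
// `get_optimal_k_davies_bouldin` could be built on: the index would be computed for each
// candidate k, and the k minimizing it (lower is better) returned. Assumes labels lie in
// 0..k, every cluster is non-empty, and centroids are distinct; k = 1 degenerates (there
// is no pair of clusters), so candidate values would start at k = 2.
#[allow(dead_code, clippy::cast_precision_loss)]
fn davies_bouldin_index(
    samples: ArrayView2<'_, Feature>,
    k: usize,
    labels: ArrayView1<'_, usize>,
) -> Feature {
    // per-cluster centroids and sample counts
    let mut centroids = Array2::<Feature>::zeros((k, samples.ncols()));
    let mut counts = vec![0usize; k];
    for (sample, &label) in samples.outer_iter().zip(labels.iter()) {
        let mut row = centroids.row_mut(label);
        row += &sample;
        counts[label] += 1;
    }
    for (mut row, &count) in centroids.outer_iter_mut().zip(counts.iter()) {
        row.mapv_inplace(|v| v / count as Feature);
    }
    // S_i: mean distance from cluster i's samples to its centroid
    let mut scatter = vec![0.0; k];
    for (sample, &label) in samples.outer_iter().zip(labels.iter()) {
        scatter[label] += L2Dist.distance(sample, centroids.row(label));
    }
    for (s, &count) in scatter.iter_mut().zip(counts.iter()) {
        *s /= count as Feature;
    }
    // DB = (1/k) * sum_i max_{j != i} (S_i + S_j) / d(c_i, c_j)
    (0..k)
        .map(|i| {
            (0..k)
                .filter(|&j| j != i)
                .map(|j| {
                    (scatter[i] + scatter[j])
                        / L2Dist.distance(centroids.row(i), centroids.row(j))
                })
                .fold(0.0, Feature::max)
        })
        .sum::<Feature>()
        / k as Feature
}
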
/// Generate B reference data sets with a random uniform distribution
///
/// (excerpt from reference paper)
/// """
/// We consider two choices for the reference distribution:
///
/// 1. generate each reference feature uniformly over the range of the observed values for that feature.
/// 2. generate the reference features from a uniform distribution over a box aligned with the
///    principal components of the data.
///    In detail, if X is our n by p data matrix, we assume that the columns have mean 0 and compute
///    the singular value decomposition X = UDV^T. We transform via X' = XV and then draw uniform features Z'
///    over the ranges of the columns of X', as in method (1) above.
///    Finally, we back-transform via Z = Z'V^T to give reference data Z.
///
/// Method (1) has the advantage of simplicity. Method (2) takes into account the shape of the
/// data distribution and makes the procedure rotationally invariant, as long as the
/// clustering method itself is invariant.
/// """
///
/// For now, we use method (1): it is simpler to implement, our data is already normalized, and
/// the ordering of features is important, meaning that we can't rotate the data anyway.
fn generate_reference_datasets(samples: ArrayView2<'_, Feature>, b: usize) -> Vec<FitDataset> {
    (0..b)
        .into_par_iter()
        .map(|_| Dataset::from(generate_ref_single(samples.view())))
        .collect()
}

fn generate_ref_single(samples: ArrayView2<'_, Feature>) -> Array2<Feature> {
    // draw each feature (column) uniformly over the range of its observed values,
    // then stack the columns back into an (nrecords x nfeatures) array
    let feature_distributions = samples
        .axis_iter(Axis(1))
        .map(|feature| {
            Array::random(
                feature.dim(),
                Uniform::new(
                    feature.min().copied().unwrap_or_default(),
                    feature.max().copied().unwrap_or_default(),
                ),
            )
        })
        .collect::<Vec<_>>();
    let feature_dists_views = feature_distributions
        .iter()
        .map(ndarray::ArrayBase::view)
        .collect::<Vec<_>>();
    ndarray::stack(Axis(0), &feature_dists_views)
        .unwrap()
        .t()
        .to_owned()
}

/// Calculate `W_k`, the within-cluster dispersion for the given clustering
///
/// `W_k = \sum_{r=1}^{k} \frac{D_r}{2 n_r}`
fn calc_within_dispersion(
    labels: ArrayView1<'_, usize>,
    k: usize,
    pairwise_distances: ArrayView1<'_, Feature>,
) -> Feature {
    debug_assert_eq!(k, labels.iter().max().unwrap() + 1);

    // we first need to convert our list of labels into a list of the number of samples in each cluster
    let counts = labels.iter().fold(vec![0u32; k], |mut counts, &label| {
        counts[label] += 1;
        counts
    });
    // then, we calculate the within-cluster dispersion
    #[allow(clippy::cast_precision_loss)]
    counts
        .iter()
        .zip(pairwise_distances.iter())
        .map(|(&count, distance)| (2.0 * count as Feature).recip() * distance)
        .sum()
}

/// Calculate the `D_r` array: for each cluster r in the given clustering, the sum of the
/// pairwise distances between samples in that cluster, counted over ordered pairs
/// (i.e. twice the sum over unique pairs, as in the reference paper)
///
/// # Arguments
///
/// - `samples`: The samples in the dataset
/// - `k`: The number of clusters
/// - `labels`: The cluster labels for each sample
fn calc_pairwise_distances(
    samples: ArrayView2<'_, Feature>,
    k: usize,
    labels: ArrayView1<'_, usize>,
) -> Array1<Feature> {
    debug_assert_eq!(
        samples.nrows(),
        labels.len(),
        "Samples and labels must have the same length"
    );
    debug_assert_eq!(
        k,
        labels.iter().max().unwrap() + 1,
        "Labels must be in the range 0..k"
    );

    // for each cluster, calculate the sum of the pairwise distances between samples in that cluster
    let mut distances = Array1::zeros(k);
    let mut clusters = vec![Vec::new(); k];
    // build clusters
    for (sample, label) in samples.outer_iter().zip(labels.iter()) {
        clusters[*label].push(sample);
    }
    // calculate the pairwise distances within each cluster
    for (r, cluster) in clusters.iter().enumerate() {
        // clusters with fewer than two samples have no pairs (and `len() - 1` would underflow)
        if cluster.len() < 2 {
            continue;
        }
        let mut pairwise_dists = 0.;
        for i in 0..cluster.len() - 1 {
            let a = cluster[i];
            let rest = &cluster[i + 1..];
            for &b in rest {
                pairwise_dists += L2Dist.distance(a, b);
            }
        }
        // double the sum over unique pairs to count ordered pairs
        distances[r] = 2. * pairwise_dists;
    }
    distances
}

/// Functions available for the `Initialized` state
impl ClusteringHelper<Initialized> {
    /// Cluster the data into k clusters
    ///
    /// # Panics
    ///
    /// Panics if the underlying clustering fit fails
    #[must_use]
    #[inline]
    pub fn cluster(self) -> ClusteringHelper<Finished> {
        let Initialized {
            clustering_method,
            embeddings,
            k,
        } = self.state;

        let embedding_dataset = Dataset::from(embeddings);
        let labels = clustering_method.fit(k, &embedding_dataset);

        ClusteringHelper {
            state: Finished { labels, k },
        }
    }
}

/// Functions available for the `Finished` state
impl ClusteringHelper<Finished> {
    /// Use the labels to reorganize the provided samples into clusters
    #[must_use]
    #[inline]
    pub fn extract_analysis_clusters<T: Clone>(&self, samples: Vec<T>) -> Vec<Vec<T>> {
        let mut clusters = vec![Vec::new(); self.state.k];

        for (sample, &label) in samples.into_iter().zip(self.state.labels.iter()) {
            clusters[label].push(sample);
        }

        clusters
    }
}

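// Illustrative end-to-end use of the typestate pipeline above (not part of the original
// module): EntryPoint -> NotInitialized -> Initialized -> Finished. The `k_max` and `b`
// values are arbitrary placeholders; `items` must be paired row-for-row with `samples`.
#[allow(dead_code)]
fn cluster_items<T: Clone>(
    samples: Array2<Feature>,
    items: Vec<T>,
) -> Result<Vec<Vec<T>>, ClusteringError> {
    let helper = ClusteringHelper::new(
        samples,
        24,                               // k_max: largest number of clusters to try
        KOptimal::GapStatistic { b: 10 }, // 10 uniform reference datasets
        ClusteringMethod::KMeans,
        ProjectionMethod::Pca,
    )?;
    let initialized = helper.initialize()?; // chooses k via the gap statistic
    let finished = initialized.cluster(); // fits the final model with that k
    Ok(finished.extract_analysis_clusters(items))
}
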
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::{arr1, arr2, s};
    use ndarray_rand::rand_distr::StandardNormal;
    use pretty_assertions::assert_eq;
    use rstest::rstest;

    #[test]
    fn test_generate_reference_data_set() {
        let data = arr2(&[[10.0, -10.0], [20.0, -20.0], [30.0, -30.0]]);

        let ref_data = generate_ref_single(data.view());

        // first column: all values between 10.0 and 30.0
        assert!(
            ref_data
                .slice(s![.., 0])
                .iter()
                .all(|v| *v >= 10.0 && *v <= 30.0)
        );

        // second column: all values between -30.0 and -10.0
        assert!(
            ref_data
                .slice(s![.., 1])
                .iter()
                .all(|v| *v <= -10.0 && *v >= -30.0)
        );

        // check that the shape is correct
        assert_eq!(ref_data.shape(), data.shape());

        // check that the data is not the same as the original data
        assert_ne!(ref_data, data);
    }

    #[test]
    fn test_pairwise_distances() {
        let samples = arr2(&[[1.0, 1.0], [1.0, 1.0], [2.0, 2.0], [2.0, 2.0]]);
        let labels = arr1(&[0, 0, 1, 1]);

        let pairwise_distances = calc_pairwise_distances(samples.view(), 2, labels.view());

        assert!(
            f32::EPSILON > (pairwise_distances[0] - 0.0).abs(),
            "{} != 0.0",
            pairwise_distances[0]
        );
        assert!(
            f32::EPSILON > (pairwise_distances[1] - 0.0).abs(),
            "{} != 0.0",
            pairwise_distances[1]
        );

        let samples = arr2(&[[1.0, 2.0], [1.0, 1.0], [2.0, 2.0], [2.0, 3.0]]);

        let pairwise_distances = calc_pairwise_distances(samples.view(), 2, labels.view());

        assert!(
            f32::EPSILON > (pairwise_distances[0] - 2.0).abs(),
            "{} != 2.0",
            pairwise_distances[0]
        );
        assert!(
            f32::EPSILON > (pairwise_distances[1] - 2.0).abs(),
            "{} != 2.0",
            pairwise_distances[1]
        );
    }

    #[test]
    fn test_calc_within_dispersion() {
        let labels = arr1(&[0, 1, 0, 1]);
        let pairwise_distances = arr1(&[1.0, 2.0]);
        let result = calc_within_dispersion(labels.view(), 2, pairwise_distances.view());

        // `W_k = \sum_{r=1}^{k} \frac{D_r}{2 n_r}` = (1/4) * 1.0 + (1/4) * 2.0 = 0.25 + 0.5 = 0.75
        assert!(f32::EPSILON > (result - 0.75).abs(), "{result} != 0.75");
    }

    #[rstest]
    #[case::project_none(ProjectionMethod::None, NUMBER_FEATURES)]
    #[case::project_tsne(ProjectionMethod::TSne, EMBEDDING_SIZE)]
    #[case::project_pca(ProjectionMethod::Pca, EMBEDDING_SIZE)]
    fn test_project(
        #[case] projection_method: ProjectionMethod,
        #[case] expected_embedding_size: usize,
    ) {
        // generate 100 random samples; we use a normal distribution because with a uniform
        // distribution the data has no real "principal components" and PCA will not work as
        // expected, since almost all the eigenvalues will fall below the cutoff
        let mut samples = Array2::random((100, NUMBER_FEATURES), StandardNormal);
        normalize_embeddings_inplace(&mut samples);

        let result = projection_method.project(samples).unwrap();

        // ensure embeddings are the correct shape
        assert_eq!(result.shape(), &[100, expected_embedding_size]);

        // ensure the data is normalized
        for i in 0..expected_embedding_size {
            let min = result.column(i).min().copied().unwrap_or_default();
            let max = result.column(i).max().copied().unwrap_or_default();
            assert!(
                f32::EPSILON > (min + 1.0).abs(),
                "Min value of column {i} is not -1.0: {min}",
            );
            assert!(
                f32::EPSILON > (max - 1.0).abs(),
                "Max value of column {i} is not 1.0: {max}",
            );
        }
    }
}

// #[cfg(feature = "plot_gap")]
// fn plot_gap_statistic(data: Vec<(usize, f64, f64)>) {
//     use plotters::prelude::*;

//     // Assuming data is a Vec<(usize, f64, f64)> of (k, gap_k, s_k)
//     let root_area = BitMapBackend::new("gap_statistic_plot.png", (640, 480)).into_drawing_area();
//     root_area.fill(&WHITE).unwrap();

//     let max_gap_k = data
//         .iter()
//         .map(|(_, gap_k, _)| *gap_k)
//         .fold(f64::MIN, f64::max);
//     let min_gap_k = data
//         .iter()
//         .map(|(_, gap_k, _)| *gap_k)
//         .fold(f64::MAX, f64::min);
//     let max_k = data.iter().map(|(k, _, _)| *k).max().unwrap_or(0);

//     let mut chart = ChartBuilder::on(&root_area)
//         .caption("Gap Statistic Plot", ("sans-serif", 30))
//         .margin(5)
//         .x_label_area_size(30)
//         .y_label_area_size(30)
//         .build_cartesian_2d(0..max_k, min_gap_k..max_gap_k)
//         .unwrap();

//     chart.configure_mesh().draw().unwrap();

//     for (k, gap_k, s_k) in data {
//         chart
//             .draw_series(PointSeries::of_element(
//                 vec![(k, gap_k)],
//                 5,
//                 &RED,
//                 &|coord, size, style| {
//                     EmptyElement::at(coord) + Circle::new((0, 0), size, style.filled())
//                 },
//             ))
//             .unwrap();

//         // Drawing error bars
//         chart
//             .draw_series(LineSeries::new(
//                 vec![(k, gap_k - s_k), (k, gap_k + s_k)],
//                 &BLACK,
//             ))
//             .unwrap();
//     }

//     root_area.present().unwrap();
// }
679// }