mecomp_analysis/clustering.rs
//! This module contains helpers that wrap the k-means crate to perform clustering on the data
//! without having to choose an exact number of clusters.
//!
//! Instead, you provide the maximum number of clusters you want to try, and we'll
//! use one of a range of methods to determine the optimal number of clusters.
//!
//! # References:
//!
//! - The gap statistic [R. Tibshirani, G. Walther, and T. Hastie (Stanford University, 2001)](https://hastie.su.domains/Papers/gap.pdf)
//! - The Davies-Bouldin index [wikipedia](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index)
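//!
//! # Example
//!
//! A minimal sketch of the intended flow (`new` → `initialize` → `cluster`),
//! assuming `samples` is an `Array2<Feature>` with one row per item and `items`
//! is the matching `Vec` of things being clustered; not compiled here:
//!
//! ```ignore
//! let clusters = ClusteringHelper::new(
//!     samples,
//!     16, // k_max: the largest number of clusters to try
//!     KOptimal::GapStatistic { b: 50 },
//!     ClusteringMethod::KMeans,
//!     ProjectionMethod::TSne,
//! )?
//! .initialize()? // determines the optimal k
//! .cluster() // fits the final model
//! .extract_analysis_clusters(items);
//! ```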

use linfa::prelude::*;
use linfa_clustering::{GaussianMixtureModel, KMeans};
use linfa_nn::distance::{Distance, L2Dist};
use linfa_reduction::Pca;
use linfa_tsne::TSneParams;
use log::{debug, info};
use ndarray::{Array, Array1, Array2, ArrayView1, ArrayView2, Axis, Dim};
use ndarray_rand::RandomExt;
use ndarray_stats::QuantileExt;
use rand::distributions::Uniform;
use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator};
// use statrs::statistics::Statistics;

use crate::{
    DIM_EMBEDDING, Feature, NUMBER_FEATURES,
    errors::{ClusteringError, ProjectionError},
};

pub type FitDataset = Dataset<Feature, (), Dim<[usize; 1]>>;

#[derive(Clone, Copy, Debug)]
#[allow(clippy::module_name_repetitions)]
pub enum ClusteringMethod {
    KMeans,
    GaussianMixtureModel,
}

impl ClusteringMethod {
    /// Fit the clustering method to the dataset and return the cluster labels
    #[must_use]
    fn fit(self, k: usize, data: &FitDataset) -> Array1<usize> {
        match self {
            Self::KMeans => {
                let model = KMeans::params(k)
                    // .max_n_iterations(MAX_ITERATIONS)
                    .fit(data)
                    .unwrap();
                model.predict(data.records())
            }
            Self::GaussianMixtureModel => {
                let model = GaussianMixtureModel::params(k)
                    .init_method(linfa_clustering::GmmInitMethod::KMeans)
                    .n_runs(10)
                    .fit(data)
                    .unwrap();
                model.predict(data.records())
            }
        }
    }
}

#[derive(Clone, Copy, Debug)]
pub enum KOptimal {
    GapStatistic {
        /// The number of reference datasets to generate
        b: u32,
    },
    DaviesBouldin,
}

/// Should the data be projected into a lower-dimensional space before clustering, and if so, how?
#[derive(Clone, Copy, Debug, Default)]
pub enum ProjectionMethod {
    /// Use t-SNE to project the data into a lower-dimensional space
    TSne,
    /// Use PCA to project the data into a lower-dimensional space
    Pca,
    /// Don't project the data
    #[default]
    None,
}

impl ProjectionMethod {
    /// Project the data into a lower-dimensional space
    ///
    /// # Errors
    ///
    /// Will return an error if there was an error projecting the data into a lower-dimensional space
    #[inline]
    pub fn project(self, samples: Array2<Feature>) -> Result<Array2<Feature>, ProjectionError> {
        let result = match self {
            Self::TSne => {
                let nrecords = samples.nrows();
                // use the t-SNE algorithm to project the data into a lower-dimensional space
                debug!("Generating embeddings (size: {EMBEDDING_SIZE}) using t-SNE");
                #[allow(clippy::cast_precision_loss)]
                let mut embeddings = TSneParams::embedding_size(EMBEDDING_SIZE)
                    .perplexity(Feature::max(samples.nrows() as Feature / 20., 5.))
                    .approx_threshold(0.5)
                    .transform(samples)?;
                debug_assert_eq!(embeddings.shape(), &[nrecords, EMBEDDING_SIZE]);

                // normalize the embeddings so each dimension is between -1 and 1
                debug!("Normalizing embeddings");
                normalize_embeddings_inplace(&mut embeddings);
                embeddings
            }
            Self::Pca => {
                let nrecords = samples.nrows();
                // use the PCA algorithm to project the data into a lower-dimensional space
                debug!("Generating embeddings (size: {EMBEDDING_SIZE}) using PCA");
                // linfa_reduction::pca::Pca only works for f64, see: https://github.com/rust-ml/linfa/issues/232
                let data = Dataset::from(samples.mapv(f64::from));
                let pca: Pca<f64> = Pca::params(EMBEDDING_SIZE).whiten(true).fit(&data)?;
                #[allow(clippy::cast_possible_truncation)]
                let mut embeddings = pca.predict(&data).mapv(|f| f as Feature);
                debug_assert_eq!(embeddings.shape(), &[nrecords, EMBEDDING_SIZE]);

                // normalize the embeddings so each dimension is between -1 and 1
                debug!("Normalizing embeddings");
                normalize_embeddings_inplace(&mut embeddings);
                embeddings
            }
            Self::None => {
                debug!("Using original data as embeddings");
                samples
            }
        };
        debug!("Embeddings shape: {:?}", result.shape());
        Ok(result)
    }
}

// Normalize the embeddings in-place so each dimension (column) spans -1.0 to 1.0.
fn normalize_embeddings_inplace(embeddings: &mut Array2<Feature>) {
    for i in 0..embeddings.ncols() {
        let min = embeddings.column(i).min().copied().unwrap_or_default();
        let max = embeddings.column(i).max().copied().unwrap_or_default();
        let range = max - min;
        if range == 0. {
            // a constant column carries no information; map it to 0 rather than dividing by zero
            embeddings.column_mut(i).fill(0.);
            continue;
        }
        embeddings
            .column_mut(i)
            .mapv_inplace(|v| ((v - min) / range).mul_add(2., -1.));
    }
}

/// Dimensionality that the t-SNE and PCA projection methods project the data into:
/// the base-2 logarithm of the larger of `DIM_EMBEDDING` and `NUMBER_FEATURES`, clamped to at least 2.
const EMBEDDING_SIZE: usize = {
    let log2 = usize::ilog2(if DIM_EMBEDDING < NUMBER_FEATURES {
        NUMBER_FEATURES
    } else {
        DIM_EMBEDDING
    }) as usize;
    if log2 < 2 { 2 } else { log2 }
};
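
// A minimal compile-time sanity check of the clamp above (it assumes nothing
// about the concrete values of `DIM_EMBEDDING` / `NUMBER_FEATURES` beyond the invariant itself):
const _: () = assert!(EMBEDDING_SIZE >= 2);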

#[allow(clippy::module_name_repetitions)]
pub struct ClusteringHelper<S>
where
    S: Sized,
{
    state: S,
}

pub struct EntryPoint;
pub struct NotInitialized {
    /// The embeddings of our input, as an N×`EMBEDDING_SIZE` array
    embeddings: Array2<Feature>,
    pub k_max: usize,
    pub optimizer: KOptimal,
    pub clustering_method: ClusteringMethod,
}
pub struct Initialized {
    /// The embeddings of our input, as an N×`EMBEDDING_SIZE` array
    embeddings: Array2<Feature>,
    pub k: usize,
    pub clustering_method: ClusteringMethod,
}
pub struct Finished {
    /// The labels of the samples, as a length-N array.
    /// Each element is the cluster that the corresponding sample belongs to.
    labels: Array1<usize>,
    pub k: usize,
}

/// Functions available for the `EntryPoint` state
impl ClusteringHelper<EntryPoint> {
    /// Create a new `ClusteringHelper`
    ///
    /// # Errors
    ///
    /// Will return an error if there was an error projecting the data into a lower-dimensional space
    #[allow(clippy::missing_inline_in_public_items)]
    pub fn new(
        samples: Array2<Feature>,
        k_max: usize,
        optimizer: KOptimal,
        clustering_method: ClusteringMethod,
        projection_method: ProjectionMethod,
    ) -> Result<ClusteringHelper<NotInitialized>, ClusteringError> {
        // clustering a library this small isn't meaningful
        if samples.nrows() <= 15 {
            return Err(ClusteringError::SmallLibrary);
        }

        // project the data into a lower-dimensional space
        let embeddings = projection_method.project(samples)?;

        Ok(ClusteringHelper {
            state: NotInitialized {
                embeddings,
                k_max,
                optimizer,
                clustering_method,
            },
        })
    }
}

/// Functions available for the `NotInitialized` state
impl ClusteringHelper<NotInitialized> {
    /// Initialize the `ClusteringHelper`
    ///
    /// # Errors
    ///
    /// Will return an error if there was an error calculating the optimal number of clusters
    #[inline]
    pub fn initialize(self) -> Result<ClusteringHelper<Initialized>, ClusteringError> {
        let k = self.get_optimal_k()?;
        Ok(ClusteringHelper {
            state: Initialized {
                embeddings: self.state.embeddings,
                k,
                clustering_method: self.state.clustering_method,
            },
        })
    }

    fn get_optimal_k(&self) -> Result<usize, ClusteringError> {
        match self.state.optimizer {
            KOptimal::GapStatistic { b } => self.get_optimal_k_gap_statistic(b),
            KOptimal::DaviesBouldin => self.get_optimal_k_davies_bouldin(),
        }
    }

    /// Get the optimal number of clusters using the gap statistic
    ///
    /// # References:
    ///
    /// - [R. Tibshirani, G. Walther, and T. Hastie (Stanford University, 2001)](https://hastie.su.domains/Papers/gap.pdf)
    ///
    /// # Algorithm:
    ///
    /// 1. Cluster the observed data, varying the number of clusters from k = 1, …, kmax, and compute the corresponding total within intra-cluster variation `W_k`.
    /// 2. Generate B reference data sets with a random uniform distribution. Cluster each of these reference data sets with varying number of clusters k = 1, …, kmax,
    ///    and compute the corresponding total within intra-cluster variation `W^*_{kb}`.
    /// 3. Compute the estimated gap statistic as the deviation of the observed `\log(W_k)` from its expected value under the null hypothesis:
    ///    `Gap(k) = (1/B) \sum_{b=1}^{B} \log(W^*_{kb}) − \log(W_k)`.
    ///    Compute also the standard deviation of the statistics.
    /// 4. Choose the number of clusters as the smallest value of k such that the gap statistic is within one standard deviation of the gap at k+1:
    ///    `Gap(k) ≥ Gap(k+1) − s_{k+1}`.
    fn get_optimal_k_gap_statistic(&self, b: u32) -> Result<usize, ClusteringError> {
        let embedding_dataset = Dataset::from(self.state.embeddings.clone());

        // our reference data sets
        let reference_datasets =
            generate_reference_datasets(embedding_dataset.records().view(), b as usize);

        #[allow(clippy::cast_precision_loss)]
        let b = b as Feature;

        let results = (1..=self.state.k_max)
            // for each k, cluster the data into k clusters
            .map(|k| {
                debug!("Fitting the clustering method to embeddings with k={k}");
                let labels = self.state.clustering_method.fit(k, &embedding_dataset);
                (k, labels)
            })
            // for each k, calculate the gap statistic and the standard deviation of the statistics
            .map(|(k, labels)| {
                // calculate the within intra-cluster variation for the reference data sets
                debug!(
                    "Calculating within intra-cluster variation for reference data sets with k={k}"
                );
                // note: the log base only rescales gap_k and s_k by the same factor,
                // so using log2 instead of ln does not change the chosen k
                let w_kb_log: Vec<_> = reference_datasets
                    .par_iter()
                    .map(|ref_data| {
                        // cluster the reference data into k clusters
                        let ref_labels = self.state.clustering_method.fit(k, ref_data);
                        // calculate the within intra-cluster variation for the reference data
                        let ref_pairwise_distances = calc_pairwise_distances(
                            ref_data.records().view(),
                            k,
                            ref_labels.view(),
                        );
                        calc_within_dispersion(ref_labels.view(), k, ref_pairwise_distances.view())
                            .log2()
                    })
                    .collect();
                let w_kb_log = Array::from_vec(w_kb_log);

                // calculate the within intra-cluster variation for the observed data
                let pairwise_distances =
                    calc_pairwise_distances(self.state.embeddings.view(), k, labels.view());
                let w_k = calc_within_dispersion(labels.view(), k, pairwise_distances.view());

                // finally, calculate the gap statistic
                let w_kb_log_sum: Feature = w_kb_log.sum();
                // original formula: l = (1 / B) * sum_b(log(W_kb))
                let l = b.recip() * w_kb_log_sum;
                // original formula: gap_k = (1 / B) * sum_b(log(W_kb)) - log(W_k)
                let gap_k = l - w_k.log2();
                // original formula: sd_k = [(1 / B) * sum_b((log(W_kb) - l)^2)]^0.5
                let standard_deviation = (b.recip() * (w_kb_log - l).pow2().sum()).sqrt();
                // original formula: s_k = sd_k * (1 + 1 / B)^0.5
                // calculated differently to minimize rounding errors
                let s_k = standard_deviation * (1.0 + b.recip()).sqrt();

                (k, gap_k, s_k)
            });

        // // plot the gap_k (whisker with s_k) w.r.t. k
        // #[cfg(feature = "plot_gap")]
        // plot_gap_statistic(results.clone().collect::<Vec<_>>());

        // finally, consume the results iterator until we find the optimal k
        let (mut optimal_k, mut gap_k_minus_one) = (None, None);
        for (k, gap_k, s_k) in results {
            info!("k: {k}, gap_k: {gap_k}, s_k: {s_k}");

            // the rule from step 4: Gap(k-1) >= Gap(k) - s_k means k-1 is optimal
            if let Some(gap_k_minus_one) = gap_k_minus_one
                && gap_k_minus_one >= gap_k - s_k
            {
                info!("Optimal k found: {}", k - 1);
                optimal_k = Some(k - 1);
                break;
            }

            gap_k_minus_one = Some(gap_k);
        }

        optimal_k.ok_or(ClusteringError::OptimalKNotFound(self.state.k_max))
    }

    fn get_optimal_k_davies_bouldin(&self) -> Result<usize, ClusteringError> {
        todo!();
    }
}
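
// A hedged sketch of the Davies-Bouldin index (see the wikipedia link in the
// module docs), meant as a starting point for `get_optimal_k_davies_bouldin`
// above: compute the index for each candidate k and pick the k that minimizes
// it (lower is better). This helper is hypothetical and not yet wired in,
// hence the `dead_code` allow.
#[allow(dead_code)]
fn davies_bouldin_index(
    samples: ArrayView2<'_, Feature>,
    k: usize,
    labels: ArrayView1<'_, usize>,
) -> Feature {
    // per-cluster centroids and sizes
    let mut centroids = Array2::<Feature>::zeros((k, samples.ncols()));
    let mut counts = vec![0usize; k];
    for (sample, &label) in samples.outer_iter().zip(labels.iter()) {
        centroids.row_mut(label).zip_mut_with(&sample, |c, &s| *c += s);
        counts[label] += 1;
    }
    #[allow(clippy::cast_precision_loss)]
    for (mut centroid, &count) in centroids.outer_iter_mut().zip(counts.iter()) {
        centroid.mapv_inplace(|v| v / count.max(1) as Feature);
    }

    // s_i: the mean distance of cluster i's members to its centroid
    let mut scatter: Vec<Feature> = vec![0.; k];
    for (sample, &label) in samples.outer_iter().zip(labels.iter()) {
        scatter[label] += L2Dist.distance(sample, centroids.row(label));
    }
    #[allow(clippy::cast_precision_loss)]
    for (s, &count) in scatter.iter_mut().zip(counts.iter()) {
        *s /= count.max(1) as Feature;
    }

    // DB = (1/k) * sum_i max_{j != i} (s_i + s_j) / d(c_i, c_j)
    let sum: Feature = (0..k)
        .map(|i| {
            (0..k)
                .filter(|&j| j != i)
                .map(|j| {
                    (scatter[i] + scatter[j]) / L2Dist.distance(centroids.row(i), centroids.row(j))
                })
                .fold(0., Feature::max)
        })
        .sum();
    #[allow(clippy::cast_precision_loss)]
    let db = sum / k as Feature;
    db
}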

/// Generate B reference data sets with a random uniform distribution
///
/// (excerpt from the reference paper)
/// """
/// We consider two choices for the reference distribution:
///
/// 1. generate each reference feature uniformly over the range of the observed values for that feature.
/// 2. generate the reference features from a uniform distribution over a box aligned with the
///    principal components of the data.
///    In detail, if X is our n by p data matrix, we assume that the columns have mean 0 and compute
///    the singular value decomposition X = UDV^T. We transform via X' = XV and then draw uniform features Z'
///    over the ranges of the columns of X', as in method (1) above.
///    Finally, we back-transform via Z = Z'V^T to give reference data Z.
///
/// Method (1) has the advantage of simplicity. Method (2) takes into account the shape of the
/// data distribution and makes the procedure rotationally invariant, as long as the
/// clustering method itself is invariant.
/// """
///
/// For now, we use method (1): it is simpler to implement, our data is already
/// normalized, and the ordering of features is meaningful, so we couldn't
/// rotate the data anyway.
fn generate_reference_datasets(samples: ArrayView2<'_, Feature>, b: usize) -> Vec<FitDataset> {
    (0..b)
        .into_par_iter()
        .map(|_| Dataset::from(generate_ref_single(samples.view())))
        .collect()
}

fn generate_ref_single(samples: ArrayView2<'_, Feature>) -> Array2<Feature> {
    // for each feature (column), draw samples uniformly over its observed range
    let feature_distributions = samples
        .axis_iter(Axis(1))
        .map(|feature| {
            Array::random(
                feature.dim(),
                Uniform::new(
                    feature.min().copied().unwrap_or_default(),
                    feature.max().copied().unwrap_or_default(),
                ),
            )
        })
        .collect::<Vec<_>>();
    // stack the per-feature columns back into an (n_samples x n_features) matrix
    let feature_dists_views = feature_distributions
        .iter()
        .map(ndarray::ArrayBase::view)
        .collect::<Vec<_>>();
    ndarray::stack(Axis(0), &feature_dists_views)
        .unwrap()
        .t()
        .to_owned()
}

/// Calculate `W_k`, the within intra-cluster variation for the given clustering
///
/// `W_k = \sum_{r=1}^{k} \frac{D_r}{2*n_r}`
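///
/// For example, with k = 2, `D = [1.0, 2.0]`, and two samples in each cluster,
/// `W_k = 1.0/(2*2) + 2.0/(2*2) = 0.75` (this is `test_calc_within_dispersion` below).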
fn calc_within_dispersion(
    labels: ArrayView1<'_, usize>,
    k: usize,
    pairwise_distances: ArrayView1<'_, Feature>,
) -> Feature {
    debug_assert_eq!(k, labels.iter().max().unwrap() + 1);

    // we first need to convert our list of labels into a list of the number of samples in each cluster
    let counts = labels.iter().fold(vec![0u32; k], |mut counts, &label| {
        counts[label] += 1;
        counts
    });
    // then, we calculate the within intra-cluster variation
    #[allow(clippy::cast_precision_loss)]
    counts
        .iter()
        .zip(pairwise_distances.iter())
        .map(|(&count, distance)| (2.0 * count as Feature).recip() * distance)
        .sum()
}

/// Calculate the `D_r` array, the sum of the pairwise distances in cluster r, for all clusters in the given clustering
///
/// # Arguments
///
/// - `samples`: The samples in the dataset
/// - `k`: The number of clusters
/// - `labels`: The cluster labels for each sample
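///
/// Note: following the paper, `D_r` is a double sum over ordered pairs, so each
/// unordered pair is counted twice; e.g. a cluster of two points at unit distance
/// has `D_r = 2.0` (see `test_pairwise_distances` below).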
fn calc_pairwise_distances(
    samples: ArrayView2<'_, Feature>,
    k: usize,
    labels: ArrayView1<'_, usize>,
) -> Array1<Feature> {
    debug_assert_eq!(
        samples.nrows(),
        labels.len(),
        "Samples and labels must have the same length"
    );
    debug_assert_eq!(
        k,
        labels.iter().max().unwrap() + 1,
        "Labels must be in the range 0..k"
    );

    // for each cluster, calculate the sum of the pairwise distances between samples in that cluster
    let mut distances = Array1::zeros(k);
    let mut clusters = vec![Vec::new(); k];
    // build clusters
    for (sample, label) in samples.outer_iter().zip(labels.iter()) {
        clusters[*label].push(sample);
    }
    // calculate the pairwise distances within each cluster
    for (r, cluster) in clusters.iter().enumerate() {
        let mut pairwise_dists = 0.;
        // sum over unordered pairs (saturating_sub avoids underflow on empty clusters)
        for i in 0..cluster.len().saturating_sub(1) {
            let a = cluster[i];
            let rest = &cluster[i + 1..];
            for &b in rest {
                pairwise_dists += L2Dist.distance(a, b);
            }
        }
        // D_r is a double sum over ordered pairs, i.e. twice the unordered-pair sum
        distances[r] = 2. * pairwise_dists;
    }
    distances
}

/// Functions available for the `Initialized` state
impl ClusteringHelper<Initialized> {
    /// Cluster the data into k clusters
    ///
    /// # Panics
    ///
    /// Will panic if the underlying clustering fit fails
    #[must_use]
    #[inline]
    pub fn cluster(self) -> ClusteringHelper<Finished> {
        let Initialized {
            clustering_method,
            embeddings,
            k,
        } = self.state;

        let embedding_dataset = Dataset::from(embeddings);
        let labels = clustering_method.fit(k, &embedding_dataset);

        ClusteringHelper {
            state: Finished { labels, k },
        }
    }
}

/// Functions available for the `Finished` state
impl ClusteringHelper<Finished> {
    /// Use the labels to reorganize the provided samples into clusters
    #[must_use]
    #[inline]
    pub fn extract_analysis_clusters<T: Clone>(&self, samples: Vec<T>) -> Vec<Vec<T>> {
        let mut clusters = vec![Vec::new(); self.state.k];

        for (sample, &label) in samples.into_iter().zip(self.state.labels.iter()) {
            clusters[label].push(sample);
        }

        clusters
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::{arr1, arr2, s};
    use ndarray_rand::rand_distr::StandardNormal;
    use pretty_assertions::assert_eq;
    use rstest::rstest;

    #[test]
    fn test_generate_reference_data_set() {
        let data = arr2(&[[10.0, -10.0], [20.0, -20.0], [30.0, -30.0]]);

        let ref_data = generate_ref_single(data.view());

        // first column: all values between 10.0 and 30.0
        assert!(
            ref_data
                .slice(s![.., 0])
                .iter()
                .all(|v| *v >= 10.0 && *v <= 30.0)
        );

        // second column: all values between -30.0 and -10.0
        assert!(
            ref_data
                .slice(s![.., 1])
                .iter()
                .all(|v| *v <= -10.0 && *v >= -30.0)
        );

        // check that the shape is correct
        assert_eq!(ref_data.shape(), data.shape());

        // check that the data is not the same as the original data
        assert_ne!(ref_data, data);
    }
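
    // A small added check (hypothetical values) that `normalize_embeddings_inplace`
    // maps each column onto exactly [-1.0, 1.0].
    #[test]
    fn test_normalize_embeddings_inplace() {
        let mut data = arr2(&[[0.0, -4.0], [5.0, 0.0], [10.0, 4.0]]);
        normalize_embeddings_inplace(&mut data);
        assert_eq!(data, arr2(&[[-1.0, -1.0], [0.0, 0.0], [1.0, 1.0]]));
    }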

    #[test]
    fn test_pairwise_distances() {
        let samples = arr2(&[[1.0, 1.0], [1.0, 1.0], [2.0, 2.0], [2.0, 2.0]]);
        let labels = arr1(&[0, 0, 1, 1]);

        let pairwise_distances = calc_pairwise_distances(samples.view(), 2, labels.view());

        assert!(
            f32::EPSILON > (pairwise_distances[0] - 0.0).abs(),
            "{} != 0.0",
            pairwise_distances[0]
        );
        assert!(
            f32::EPSILON > (pairwise_distances[1] - 0.0).abs(),
            "{} != 0.0",
            pairwise_distances[1]
        );

        let samples = arr2(&[[1.0, 2.0], [1.0, 1.0], [2.0, 2.0], [2.0, 3.0]]);

        let pairwise_distances = calc_pairwise_distances(samples.view(), 2, labels.view());

        assert!(
            f32::EPSILON > (pairwise_distances[0] - 2.0).abs(),
            "{} != 2.0",
            pairwise_distances[0]
        );
        assert!(
            f32::EPSILON > (pairwise_distances[1] - 2.0).abs(),
            "{} != 2.0",
            pairwise_distances[1]
        );
    }

    #[test]
    fn test_calc_within_dispersion() {
        let labels = arr1(&[0, 1, 0, 1]);
        let pairwise_distances = arr1(&[1.0, 2.0]);
        let result = calc_within_dispersion(labels.view(), 2, pairwise_distances.view());

        // `W_k = \sum_{r=1}^{k} \frac{D_r}{2*n_r}` = 1/4 * 1.0 + 1/4 * 2.0 = 0.25 + 0.5 = 0.75
        assert!(f32::EPSILON > (result - 0.75).abs(), "{result} != 0.75");
    }

    #[rstest]
    #[case::project_none(ProjectionMethod::None, NUMBER_FEATURES)]
    #[case::project_tsne(ProjectionMethod::TSne, EMBEDDING_SIZE)]
    #[case::project_pca(ProjectionMethod::Pca, EMBEDDING_SIZE)]
    fn test_project(
        #[case] projection_method: ProjectionMethod,
        #[case] expected_embedding_size: usize,
    ) {
        // generate 100 random samples; we use a normal distribution because with a uniform
        // distribution the data has no real "principal components", and PCA will not work as
        // expected since almost all the eigenvalues will fall below the cutoff
        let mut samples = Array2::random((100, NUMBER_FEATURES), StandardNormal);
        normalize_embeddings_inplace(&mut samples);

        let result = projection_method.project(samples).unwrap();

        // ensure embeddings are the correct shape
        assert_eq!(result.shape(), &[100, expected_embedding_size]);

        // ensure the data is normalized
        for i in 0..expected_embedding_size {
            let min = result.column(i).min().copied().unwrap_or_default();
            let max = result.column(i).max().copied().unwrap_or_default();
            assert!(
                f32::EPSILON > (min + 1.0).abs(),
                "Min value of column {i} is not -1.0: {min}",
            );
            assert!(
                f32::EPSILON > (max - 1.0).abs(),
                "Max value of column {i} is not 1.0: {max}",
            );
        }
    }
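
    // A sketch-style test for `extract_analysis_clusters`, building the `Finished`
    // state by hand with made-up labels (its private fields are reachable here
    // because `tests` is a child module).
    #[test]
    fn test_extract_analysis_clusters() {
        let helper = ClusteringHelper {
            state: Finished {
                labels: arr1(&[0, 1, 0, 1]),
                k: 2,
            },
        };
        let clusters = helper.extract_analysis_clusters(vec!["a", "b", "c", "d"]);
        assert_eq!(clusters, vec![vec!["a", "c"], vec!["b", "d"]]);
    }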
}

629// #[cfg(feature = "plot_gap")]
630// fn plot_gap_statistic(data: Vec<(usize, f64, f64)>) {
631// use plotters::prelude::*;
632
633// // Assuming data is a Vec<(usize, f64, f64)> of (k, gap_k, s_k)
634// let root_area = BitMapBackend::new("gap_statistic_plot.png", (640, 480)).into_drawing_area();
635// root_area.fill(&WHITE).unwrap();
636
637// let max_gap_k = data
638// .iter()
639// .map(|(_, gap_k, _)| *gap_k)
640// .fold(f64::MIN, f64::max);
641// let min_gap_k = data
642// .iter()
643// .map(|(_, gap_k, _)| *gap_k)
644// .fold(f64::MAX, f64::min);
645// let max_k = data.iter().map(|(k, _, _)| *k).max().unwrap_or(0);
646
647// let mut chart = ChartBuilder::on(&root_area)
648// .caption("Gap Statistic Plot", ("sans-serif", 30))
649// .margin(5)
650// .x_label_area_size(30)
651// .y_label_area_size(30)
652// .build_cartesian_2d(0..max_k, min_gap_k..max_gap_k)
653// .unwrap();
654
655// chart.configure_mesh().draw().unwrap();
656
657// for (k, gap_k, s_k) in data {
658// chart
659// .draw_series(PointSeries::of_element(
660// vec![(k, gap_k)],
661// 5,
662// &RED,
663// &|coord, size, style| {
664// EmptyElement::at(coord) + Circle::new((0, 0), size, style.filled())
665// },
666// ))
667// .unwrap();
668
669// // Drawing error bars
670// chart
671// .draw_series(LineSeries::new(
672// vec![(k, gap_k - s_k), (k, gap_k + s_k)],
673// &BLACK,
674// ))
675// .unwrap();
676// }
677
678// root_area.present().unwrap();
679// }