iqdb_ivf/stats.rs
1//! [`IvfClusterStats`] — diagnostic snapshot of cluster occupancy.
2//!
3//! Exposed through [`crate::IvfIndex::cluster_stats`] for operators
4//! debugging cluster imbalance. A healthy IVF index has roughly
5//! uniform `cluster_sizes`; a long tail of empty or near-empty
6//! clusters next to a few packed ones indicates centroid drift and is
7//! the signal that motivates the `retrain()` workflow.
8
/// Point-in-time report of how many vectors each inverted list holds.
///
/// [`crate::IvfIndex::cluster_stats`] hands this back once the index
/// has been trained and, usually, populated. An untrained index reports
/// an empty `cluster_sizes` vector, with `avg_size` and `size_variance`
/// both equal to `0.0`.
///
/// # Examples
///
/// ```
/// use iqdb_ivf::IvfClusterStats;
///
/// // Hand-built for illustration; real snapshots come from
/// // `IvfIndex::cluster_stats`.
/// let stats = IvfClusterStats {
///     n_clusters: 4,
///     cluster_sizes: vec![3, 3, 4, 2],
///     avg_size: 3.0,
///     size_variance: 0.5,
/// };
/// assert_eq!(stats.cluster_sizes.iter().sum::<usize>(), 12);
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct IvfClusterStats {
    /// How many partitions the index was configured to use.
    ///
    /// Copied from [`crate::IvfConfig::n_clusters`], so the snapshot can
    /// be interpreted without also passing the config around.
    pub n_clusters: usize,

    /// Number of live vectors in each cluster; the position in the
    /// vector is the cluster id.
    ///
    /// Empty before training, otherwise [`Self::n_clusters`] entries
    /// long. A `0` entry is a real observation (an empty cluster), not
    /// a placeholder, and is the kind of signal `retrain()` exists for.
    pub cluster_sizes: Vec<usize>,

    /// Mean of [`Self::cluster_sizes`], stored as `f32`.
    ///
    /// Reported as `0.0` when [`Self::cluster_sizes`] is empty.
    pub avg_size: f32,

    /// Population variance of [`Self::cluster_sizes`], stored as `f32`.
    ///
    /// Reported as `0.0` when [`Self::cluster_sizes`] is empty. The
    /// larger the value, the more unevenly vectors are spread across
    /// clusters.
    pub size_variance: f32,
}
56
57impl IvfClusterStats {
58 /// Build an `IvfClusterStats` from the cluster-size vector.
59 ///
60 /// Computes `avg_size` and `size_variance` in a single sequential
61 /// pass over `cluster_sizes`, in fixed iteration order, so the
62 /// snapshot is deterministic and free of any cross-platform
63 /// float-reduction drift.
64 #[must_use]
65 pub(crate) fn from_sizes(n_clusters: usize, cluster_sizes: Vec<usize>) -> Self {
66 let len = cluster_sizes.len();
67 if len == 0 {
68 return Self {
69 n_clusters,
70 cluster_sizes,
71 avg_size: 0.0,
72 size_variance: 0.0,
73 };
74 }
75 // f64 reductions for cross-platform-stable arithmetic; downcast
76 // at the end. Matches the train.rs reduction discipline.
77 let mut sum: f64 = 0.0;
78 for &s in &cluster_sizes {
79 sum += s as f64;
80 }
81 let avg = sum / (len as f64);
82 let mut var_sum: f64 = 0.0;
83 for &s in &cluster_sizes {
84 let d = (s as f64) - avg;
85 var_sum += d * d;
86 }
87 let variance = var_sum / (len as f64);
88 Self {
89 n_clusters,
90 cluster_sizes,
91 avg_size: avg as f32,
92 size_variance: variance as f32,
93 }
94 }
95}
96
#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used)]

    use super::*;

    /// With no sizes supplied, both statistics are zero and the
    /// configured cluster count is still carried through.
    #[test]
    fn empty_sizes_yields_zero_avg_and_variance() {
        let snapshot = IvfClusterStats::from_sizes(4, Vec::new());

        assert_eq!(snapshot.n_clusters, 4);
        assert!(snapshot.cluster_sizes.is_empty());
        assert_eq!(snapshot.avg_size, 0.0);
        assert_eq!(snapshot.size_variance, 0.0);
    }

    /// Identical cluster sizes: the mean is that size and nothing
    /// deviates from it.
    #[test]
    fn uniform_sizes_have_zero_variance() {
        let snapshot = IvfClusterStats::from_sizes(4, vec![5; 4]);

        assert_eq!(snapshot.avg_size, 5.0);
        assert_eq!(snapshot.size_variance, 0.0);
    }

    /// Worked example for sizes [1, 3, 5, 7]:
    /// mean = 16 / 4 = 4, deviations = [-3, -1, 1, 3],
    /// population variance = (9 + 1 + 1 + 9) / 4 = 5.
    #[test]
    fn variance_matches_hand_computation() {
        let snapshot = IvfClusterStats::from_sizes(4, vec![1, 3, 5, 7]);

        let mean_error = (snapshot.avg_size - 4.0).abs();
        let variance_error = (snapshot.size_variance - 5.0).abs();
        assert!(mean_error < 1e-6);
        assert!(variance_error < 1e-6);
    }
}