// iqdb_ivf/stats.rs

//! [`IvfClusterStats`] — diagnostic snapshot of cluster occupancy.
//!
//! Exposed through [`crate::IvfIndex::cluster_stats`] for operators
//! debugging cluster imbalance. A healthy IVF index has roughly
//! uniform `cluster_sizes`; a long tail of empty or near-empty
//! clusters next to a few packed ones indicates centroid drift and is
//! the signal that motivates the `retrain()` workflow.

/// Diagnostic snapshot of inverted-list occupancy.
///
/// Returned by [`crate::IvfIndex::cluster_stats`] after the index has
/// been trained and (typically) populated. Before training the
/// `cluster_sizes` vector is empty and `avg_size` / `size_variance`
/// are `0.0`.
///
/// All fields are public, so a value built by hand (as in the example
/// below) is not checked for internal consistency; the invariants
/// documented on each field describe snapshots produced by the index.
///
/// # Examples
///
/// ```
/// use iqdb_ivf::IvfClusterStats;
///
/// // Synthetic example — IvfIndex::cluster_stats returns the real one.
/// let stats = IvfClusterStats {
///     n_clusters: 4,
///     cluster_sizes: vec![3, 3, 4, 2],
///     avg_size: 3.0,
///     size_variance: 0.5,
/// };
/// assert_eq!(stats.cluster_sizes.iter().sum::<usize>(), 12);
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct IvfClusterStats {
    /// Number of partitions the index was configured with.
    ///
    /// Echoed from [`crate::IvfConfig::n_clusters`] so a caller does
    /// not need to also carry the config to interpret the snapshot.
    pub n_clusters: usize,

    /// Live vector count per cluster, indexed by cluster id.
    ///
    /// Always either empty (before training) or of length
    /// [`Self::n_clusters`]. A cluster size of `0` is meaningful and
    /// is what motivates `retrain()` work in a follow-up.
    pub cluster_sizes: Vec<usize>,

    /// Arithmetic mean of [`Self::cluster_sizes`] as `f32`.
    ///
    /// `0.0` when [`Self::cluster_sizes`] is empty.
    pub avg_size: f32,

    /// Population variance of [`Self::cluster_sizes`] as `f32`
    /// (squared deviations from the mean, divided by the number of
    /// clusters rather than by one less).
    ///
    /// `0.0` when [`Self::cluster_sizes`] is empty. Higher values
    /// signal cluster imbalance.
    pub size_variance: f32,
}

57impl IvfClusterStats {
58    /// Build an `IvfClusterStats` from the cluster-size vector.
59    ///
60    /// Computes `avg_size` and `size_variance` in a single sequential
61    /// pass over `cluster_sizes`, in fixed iteration order, so the
62    /// snapshot is deterministic and free of any cross-platform
63    /// float-reduction drift.
64    #[must_use]
65    pub(crate) fn from_sizes(n_clusters: usize, cluster_sizes: Vec<usize>) -> Self {
66        let len = cluster_sizes.len();
67        if len == 0 {
68            return Self {
69                n_clusters,
70                cluster_sizes,
71                avg_size: 0.0,
72                size_variance: 0.0,
73            };
74        }
75        // f64 reductions for cross-platform-stable arithmetic; downcast
76        // at the end. Matches the train.rs reduction discipline.
77        let mut sum: f64 = 0.0;
78        for &s in &cluster_sizes {
79            sum += s as f64;
80        }
81        let avg = sum / (len as f64);
82        let mut var_sum: f64 = 0.0;
83        for &s in &cluster_sizes {
84            let d = (s as f64) - avg;
85            var_sum += d * d;
86        }
87        let variance = var_sum / (len as f64);
88        Self {
89            n_clusters,
90            cluster_sizes,
91            avg_size: avg as f32,
92            size_variance: variance as f32,
93        }
94    }
95}
96
#[cfg(test)]
mod tests {
    // NOTE(review): nothing below calls `unwrap`; the allow is kept
    // unchanged, presumably as a crate-wide test-module convention —
    // confirm before removing.
    #![allow(clippy::unwrap_used)]

    use super::*;

    /// An empty size vector short-circuits to all-zero statistics while
    /// still echoing the configured cluster count.
    #[test]
    fn empty_sizes_yields_zero_avg_and_variance() {
        let stats = IvfClusterStats::from_sizes(4, Vec::new());
        assert_eq!(stats.n_clusters, 4);
        assert!(stats.cluster_sizes.is_empty());
        assert_eq!(stats.avg_size, 0.0);
        assert_eq!(stats.size_variance, 0.0);
    }

    /// Identical cluster sizes have no spread around the mean.
    #[test]
    fn uniform_sizes_have_zero_variance() {
        let stats = IvfClusterStats::from_sizes(4, vec![5, 5, 5, 5]);
        assert_eq!(stats.avg_size, 5.0);
        assert_eq!(stats.size_variance, 0.0);
    }

    /// Population variance (divide by N) checked against a hand-worked case.
    #[test]
    fn variance_matches_hand_computation() {
        // sizes: [1, 3, 5, 7]; mean = 4; deviations = [-3, -1, 1, 3];
        // variance = (9 + 1 + 1 + 9) / 4 = 5.
        let stats = IvfClusterStats::from_sizes(4, vec![1, 3, 5, 7]);
        assert!((stats.avg_size - 4.0).abs() < 1e-6);
        assert!((stats.size_variance - 5.0).abs() < 1e-6);
    }

    /// A single cluster is its own mean, so the variance is zero.
    #[test]
    fn single_cluster_has_zero_variance() {
        let stats = IvfClusterStats::from_sizes(1, vec![7]);
        assert_eq!(stats.cluster_sizes, vec![7]);
        assert_eq!(stats.avg_size, 7.0);
        assert_eq!(stats.size_variance, 0.0);
    }

    /// `from_sizes` stores `n_clusters` verbatim and derives the
    /// statistics from the vector's own length.
    #[test]
    fn n_clusters_is_echoed_even_when_lengths_differ() {
        // sizes: [2, 4]; mean = 3; variance = (1 + 1) / 2 = 1.
        let stats = IvfClusterStats::from_sizes(10, vec![2, 4]);
        assert_eq!(stats.n_clusters, 10);
        assert!((stats.avg_size - 3.0).abs() < 1e-6);
        assert!((stats.size_variance - 1.0).abs() < 1e-6);
    }
}