Skip to main content

cortex_runtime/navigation/
cluster.rs

1//! Cluster detection and management for SiteMap nodes.
2
3use crate::map::types::{PageType, SiteMap, FEATURE_DIM};
4
5/// Recompute clusters for a SiteMap.
6///
7/// Uses k-means clustering on feature vectors.
8/// k = max(3, sqrt(node_count / 10))
9pub fn compute_clusters(map: &mut SiteMap) {
10    let n = map.nodes.len();
11    if n == 0 {
12        return;
13    }
14
15    let k = if n < 30 {
16        1.max(n / 3)
17    } else {
18        3.max(((n as f64 / 10.0).sqrt()) as usize)
19    };
20    let k = k.min(n);
21
22    // Initialize centroids by evenly spacing through the data
23    let mut centroids: Vec<[f32; FEATURE_DIM]> = Vec::with_capacity(k);
24    for i in 0..k {
25        let idx = i * n / k;
26        centroids.push(map.features[idx]);
27    }
28
29    let mut assignments = vec![0u16; n];
30
31    // Run k-means for up to 20 iterations
32    for _ in 0..20 {
33        let mut changed = false;
34
35        // Assign each point to nearest centroid
36        for (i, feat) in map.features.iter().enumerate() {
37            let mut best_cluster = 0u16;
38            let mut best_dist = f32::MAX;
39            for (c, centroid) in centroids.iter().enumerate() {
40                let dist: f32 = feat
41                    .iter()
42                    .zip(centroid.iter())
43                    .map(|(a, b)| (a - b) * (a - b))
44                    .sum();
45                if dist < best_dist {
46                    best_dist = dist;
47                    best_cluster = c as u16;
48                }
49            }
50            if assignments[i] != best_cluster {
51                assignments[i] = best_cluster;
52                changed = true;
53            }
54        }
55
56        if !changed {
57            break;
58        }
59
60        // Recompute centroids
61        let mut sums = vec![[0.0f32; FEATURE_DIM]; k];
62        let mut counts = vec![0u32; k];
63        for (i, feat) in map.features.iter().enumerate() {
64            let c = assignments[i] as usize;
65            counts[c] += 1;
66            for (d, &val) in feat.iter().enumerate() {
67                sums[c][d] += val;
68            }
69        }
70        for c in 0..k {
71            if counts[c] > 0 {
72                for (d, sum_val) in sums[c].iter().enumerate() {
73                    centroids[c][d] = sum_val / counts[c] as f32;
74                }
75            }
76        }
77    }
78
79    map.cluster_assignments = assignments;
80    map.cluster_centroids = centroids;
81    map.header.cluster_count = k as u16;
82}
83
84/// Get the dominant PageType for a cluster.
85pub fn cluster_type(map: &SiteMap, cluster_id: u16) -> PageType {
86    let mut type_counts = std::collections::HashMap::new();
87
88    for (i, &assignment) in map.cluster_assignments.iter().enumerate() {
89        if assignment == cluster_id {
90            *type_counts.entry(map.nodes[i].page_type).or_insert(0u32) += 1;
91        }
92    }
93
94    type_counts
95        .into_iter()
96        .max_by_key(|&(_, count)| count)
97        .map(|(pt, _)| pt)
98        .unwrap_or(PageType::Unknown)
99}
100
101/// Get all node indices belonging to a cluster.
102pub fn cluster_members(map: &SiteMap, cluster_id: u16) -> Vec<u32> {
103    map.cluster_assignments
104        .iter()
105        .enumerate()
106        .filter(|(_, &a)| a == cluster_id)
107        .map(|(i, _)| i as u32)
108        .collect()
109}
110
111#[cfg(test)]
112mod tests {
113    use super::*;
114    use crate::map::builder::SiteMapBuilder;
115    use crate::map::types::*;
116
117    #[test]
118    fn test_compute_clusters() {
119        let mut builder = SiteMapBuilder::new("test.com");
120
121        // Group A: high dimension 0, zero on dimension 5
122        for i in 0..10 {
123            let mut feats = [0.0f32; FEATURE_DIM];
124            feats[0] = 10.0 + (i as f32 * 0.1);
125            feats[5] = 0.0;
126            builder.add_node(
127                &format!("https://test.com/a/{i}"),
128                PageType::Article,
129                feats,
130                200,
131            );
132        }
133
134        // Group B: high dimension 5, zero on dimension 0
135        for i in 0..10 {
136            let mut feats = [0.0f32; FEATURE_DIM];
137            feats[0] = 0.0;
138            feats[5] = 10.0 + (i as f32 * 0.1);
139            builder.add_node(
140                &format!("https://test.com/p/{i}"),
141                PageType::ProductDetail,
142                feats,
143                200,
144            );
145        }
146
147        let mut map = builder.build();
148        compute_clusters(&mut map);
149
150        assert!(!map.cluster_centroids.is_empty());
151        assert_eq!(map.cluster_assignments.len(), 20);
152
153        // No node from group B should share a cluster with any node from group A
154        // (groups are far apart in feature space)
155        let group_a_clusters: std::collections::HashSet<u16> =
156            (0..10).map(|i| map.cluster_assignments[i]).collect();
157        let group_b_clusters: std::collections::HashSet<u16> =
158            (10..20).map(|i| map.cluster_assignments[i]).collect();
159
160        // The two groups should have no overlapping clusters
161        assert!(
162            group_a_clusters.is_disjoint(&group_b_clusters),
163            "Group A clusters {:?} should not overlap with Group B clusters {:?}",
164            group_a_clusters,
165            group_b_clusters,
166        );
167    }
168
169    #[test]
170    fn test_cluster_type() {
171        let mut builder = SiteMapBuilder::new("test.com");
172        let feats = [0.0f32; FEATURE_DIM];
173
174        builder.add_node("https://test.com/a1", PageType::Article, feats, 200);
175        builder.add_node("https://test.com/a2", PageType::Article, feats, 200);
176        builder.add_node("https://test.com/p1", PageType::ProductDetail, feats, 200);
177
178        let mut map = builder.build();
179        // Force all into cluster 0
180        map.cluster_assignments = vec![0, 0, 0];
181
182        let dominant = cluster_type(&map, 0);
183        assert_eq!(dominant, PageType::Article); // 2 articles vs 1 product
184    }
185
186    #[test]
187    fn test_cluster_members() {
188        let mut builder = SiteMapBuilder::new("test.com");
189        let feats = [0.0f32; FEATURE_DIM];
190
191        builder.add_node("https://test.com/a", PageType::Article, feats, 200);
192        builder.add_node("https://test.com/b", PageType::Article, feats, 200);
193        builder.add_node("https://test.com/c", PageType::Article, feats, 200);
194
195        let mut map = builder.build();
196        map.cluster_assignments = vec![0, 1, 0];
197
198        let members = cluster_members(&map, 0);
199        assert_eq!(members, vec![0, 2]);
200
201        let members = cluster_members(&map, 1);
202        assert_eq!(members, vec![1]);
203    }
204}