Skip to main content

cortex_runtime/map/
builder.rs

1//! SiteMapBuilder for incrementally constructing a SiteMap.
2
3use crate::map::types::*;
4use std::time::{SystemTime, UNIX_EPOCH};
5
6/// Intermediate edge data during building.
7struct EdgeData {
8    from: u32,
9    edge: EdgeRecord,
10}
11
12/// Intermediate action data during building.
13struct ActionData {
14    node: u32,
15    action: ActionRecord,
16}
17
18/// Builder for constructing a SiteMap incrementally.
19pub struct SiteMapBuilder {
20    domain: String,
21    urls: Vec<String>,
22    nodes: Vec<NodeRecord>,
23    features: Vec<[f32; FEATURE_DIM]>,
24    edges: Vec<EdgeData>,
25    actions: Vec<ActionData>,
26    has_sitemap: bool,
27}
28
29impl SiteMapBuilder {
30    /// Create a new builder for the given domain.
31    pub fn new(domain: &str) -> Self {
32        Self {
33            domain: domain.to_string(),
34            urls: Vec::new(),
35            nodes: Vec::new(),
36            features: Vec::new(),
37            edges: Vec::new(),
38            actions: Vec::new(),
39            has_sitemap: false,
40        }
41    }
42
43    /// Set whether the site had a sitemap.xml.
44    pub fn set_has_sitemap(&mut self, has: bool) {
45        self.has_sitemap = has;
46    }
47
48    /// Add a node and return its index.
49    pub fn add_node(
50        &mut self,
51        url: &str,
52        page_type: PageType,
53        features: [f32; FEATURE_DIM],
54        confidence: u8,
55    ) -> u32 {
56        let index = self.nodes.len() as u32;
57
58        // Compute feature norm (L2)
59        let norm: f32 = features.iter().map(|f| f * f).sum::<f32>().sqrt();
60
61        let record = NodeRecord {
62            page_type,
63            confidence,
64            freshness: 0,
65            flags: NodeFlags::default(),
66            content_hash: 0,
67            rendered_at: 0,
68            http_status: 0,
69            depth: 0,
70            inbound_count: 0,
71            outbound_count: 0,
72            feature_norm: norm,
73            reserved: 0,
74        };
75
76        self.urls.push(url.to_string());
77        self.nodes.push(record);
78        self.features.push(features);
79
80        index
81    }
82
83    /// Add an edge between two nodes.
84    pub fn add_edge(
85        &mut self,
86        from: u32,
87        to: u32,
88        edge_type: EdgeType,
89        weight: u8,
90        flags: EdgeFlags,
91    ) {
92        self.edges.push(EdgeData {
93            from,
94            edge: EdgeRecord {
95                target_node: to,
96                edge_type,
97                weight,
98                flags,
99                reserved: 0,
100            },
101        });
102    }
103
104    /// Add an action available on a node.
105    pub fn add_action(
106        &mut self,
107        node: u32,
108        opcode: OpCode,
109        target_node: i32,
110        cost_hint: u8,
111        risk: u8,
112    ) {
113        self.actions.push(ActionData {
114            node,
115            action: ActionRecord {
116                opcode,
117                target_node,
118                cost_hint,
119                risk,
120                http_executable: false,
121            },
122        });
123    }
124
125    /// Add an HTTP-executable action available on a node.
126    ///
127    /// HTTP-executable actions can be executed via HTTP POST/GET without a browser.
128    pub fn add_action_http(
129        &mut self,
130        node: u32,
131        opcode: OpCode,
132        target_node: i32,
133        cost_hint: u8,
134        risk: u8,
135    ) {
136        self.actions.push(ActionData {
137            node,
138            action: ActionRecord {
139                opcode,
140                target_node,
141                cost_hint,
142                risk,
143                http_executable: true,
144            },
145        });
146    }
147
148    /// Mark a node as rendered with updated features and flags.
149    pub fn set_rendered(&mut self, node: u32, features: [f32; FEATURE_DIM]) {
150        let idx = node as usize;
151        if idx < self.nodes.len() {
152            self.nodes[idx].flags.0 |= NodeFlags::RENDERED;
153            self.nodes[idx].freshness = 255;
154            self.features[idx] = features;
155            let norm: f32 = features.iter().map(|f| f * f).sum::<f32>().sqrt();
156            self.nodes[idx].feature_norm = norm;
157        }
158    }
159
160    /// Read a single feature dimension for an existing node.
161    pub fn get_feature(&self, node: u32, dimension: usize) -> f32 {
162        let idx = node as usize;
163        if idx < self.features.len() && dimension < FEATURE_DIM {
164            self.features[idx][dimension]
165        } else {
166            0.0
167        }
168    }
169
170    /// Update a single feature dimension for an existing node.
171    ///
172    /// Used to patch feature vectors after initial encoding — for example,
173    /// to inject HTTP action counts discovered in Layer 2.5.
174    pub fn update_feature(&mut self, node: u32, dimension: usize, value: f32) {
175        let idx = node as usize;
176        if idx < self.features.len() && dimension < FEATURE_DIM {
177            self.features[idx][dimension] = value;
178        }
179    }
180
181    /// Merge additional flag bits into a node's flags.
182    ///
183    /// Used to set computed flags like `HAS_PRICE`, `HAS_MEDIA`, `HAS_FORM`
184    /// after feature encoding.
185    pub fn merge_flags(&mut self, node: u32, flags: NodeFlags) {
186        let idx = node as usize;
187        if idx < self.nodes.len() {
188            self.nodes[idx].flags.0 |= flags.0;
189        }
190    }
191
192    /// Build the final SiteMap.
193    pub fn build(mut self) -> SiteMap {
194        let node_count = self.nodes.len();
195
196        // Sort edges by source node for CSR format
197        self.edges.sort_by_key(|e| e.from);
198
199        // Build edge CSR index
200        let mut edge_index = vec![0u32; node_count + 1];
201        let mut edges = Vec::with_capacity(self.edges.len());
202        for ed in &self.edges {
203            edges.push(ed.edge.clone());
204        }
205
206        // Count edges per node
207        let mut counts = vec![0u32; node_count];
208        for ed in &self.edges {
209            if (ed.from as usize) < node_count {
210                counts[ed.from as usize] += 1;
211            }
212        }
213        // Build prefix sum
214        for i in 0..node_count {
215            edge_index[i + 1] = edge_index[i] + counts[i];
216        }
217
218        // Update inbound/outbound counts
219        for ed in &self.edges {
220            let from = ed.from as usize;
221            let to = ed.edge.target_node as usize;
222            if from < node_count {
223                self.nodes[from].outbound_count = self.nodes[from].outbound_count.saturating_add(1);
224            }
225            if to < node_count {
226                self.nodes[to].inbound_count = self.nodes[to].inbound_count.saturating_add(1);
227            }
228        }
229
230        // Sort actions by node for CSR format
231        self.actions.sort_by_key(|a| a.node);
232
233        // Build action CSR index
234        let mut action_index = vec![0u32; node_count + 1];
235        let mut actions = Vec::with_capacity(self.actions.len());
236        for ad in &self.actions {
237            actions.push(ad.action.clone());
238        }
239        let mut act_counts = vec![0u32; node_count];
240        for ad in &self.actions {
241            if (ad.node as usize) < node_count {
242                act_counts[ad.node as usize] += 1;
243            }
244        }
245        for i in 0..node_count {
246            action_index[i + 1] = action_index[i] + act_counts[i];
247        }
248
249        // Compute clusters using basic k-means
250        let k = if node_count < 30 {
251            1.max(node_count / 3)
252        } else {
253            3.max(((node_count as f64 / 10.0).sqrt()) as usize)
254        };
255        let (cluster_assignments, cluster_centroids) = compute_clusters(&self.features, k);
256
257        let mapped_at = SystemTime::now()
258            .duration_since(UNIX_EPOCH)
259            .unwrap_or_default()
260            .as_secs();
261
262        let mut flags: u16 = 0;
263        if self.has_sitemap {
264            flags |= 1;
265        }
266
267        let header = MapHeader {
268            magic: SITEMAP_MAGIC,
269            format_version: FORMAT_VERSION,
270            domain: self.domain,
271            mapped_at,
272            node_count: node_count as u32,
273            edge_count: edges.len() as u32,
274            cluster_count: cluster_centroids.len() as u16,
275            flags,
276        };
277
278        SiteMap {
279            header,
280            nodes: self.nodes,
281            edges,
282            edge_index,
283            features: self.features,
284            actions,
285            action_index,
286            cluster_assignments,
287            cluster_centroids,
288            urls: self.urls,
289        }
290    }
291}
292
293/// Simple k-means clustering on feature vectors.
294fn compute_clusters(
295    features: &[[f32; FEATURE_DIM]],
296    k: usize,
297) -> (Vec<u16>, Vec<[f32; FEATURE_DIM]>) {
298    let n = features.len();
299    if n == 0 || k == 0 {
300        return (Vec::new(), Vec::new());
301    }
302    let k = k.min(n);
303
304    // Initialize centroids by evenly spacing through the data
305    let mut centroids: Vec<[f32; FEATURE_DIM]> = Vec::with_capacity(k);
306    for i in 0..k {
307        let idx = i * n / k;
308        centroids.push(features[idx]);
309    }
310
311    let mut assignments = vec![0u16; n];
312
313    // Run k-means for up to 20 iterations
314    for _ in 0..20 {
315        let mut changed = false;
316
317        // Assign each point to nearest centroid
318        for (i, feat) in features.iter().enumerate() {
319            let mut best_cluster = 0u16;
320            let mut best_dist = f32::MAX;
321            for (c, centroid) in centroids.iter().enumerate() {
322                let dist: f32 = feat
323                    .iter()
324                    .zip(centroid.iter())
325                    .map(|(a, b)| (a - b) * (a - b))
326                    .sum();
327                if dist < best_dist {
328                    best_dist = dist;
329                    best_cluster = c as u16;
330                }
331            }
332            if assignments[i] != best_cluster {
333                assignments[i] = best_cluster;
334                changed = true;
335            }
336        }
337
338        if !changed {
339            break;
340        }
341
342        // Recompute centroids
343        let mut sums = vec![[0.0f32; FEATURE_DIM]; k];
344        let mut counts = vec![0u32; k];
345        for (i, feat) in features.iter().enumerate() {
346            let c = assignments[i] as usize;
347            counts[c] += 1;
348            for (d, &val) in feat.iter().enumerate() {
349                sums[c][d] += val;
350            }
351        }
352        for c in 0..k {
353            if counts[c] > 0 {
354                for (d, sum_val) in sums[c].iter().enumerate() {
355                    centroids[c][d] = sum_val / counts[c] as f32;
356                }
357            }
358        }
359    }
360
361    (assignments, centroids)
362}