1use crate::map::types::*;
4use std::time::{SystemTime, UNIX_EPOCH};
5
6struct EdgeData {
8 from: u32,
9 edge: EdgeRecord,
10}
11
12struct ActionData {
14 node: u32,
15 action: ActionRecord,
16}
17
18pub struct SiteMapBuilder {
20 domain: String,
21 urls: Vec<String>,
22 nodes: Vec<NodeRecord>,
23 features: Vec<[f32; FEATURE_DIM]>,
24 edges: Vec<EdgeData>,
25 actions: Vec<ActionData>,
26 has_sitemap: bool,
27}
28
29impl SiteMapBuilder {
30 pub fn new(domain: &str) -> Self {
32 Self {
33 domain: domain.to_string(),
34 urls: Vec::new(),
35 nodes: Vec::new(),
36 features: Vec::new(),
37 edges: Vec::new(),
38 actions: Vec::new(),
39 has_sitemap: false,
40 }
41 }
42
43 pub fn set_has_sitemap(&mut self, has: bool) {
45 self.has_sitemap = has;
46 }
47
48 pub fn add_node(
50 &mut self,
51 url: &str,
52 page_type: PageType,
53 features: [f32; FEATURE_DIM],
54 confidence: u8,
55 ) -> u32 {
56 let index = self.nodes.len() as u32;
57
58 let norm: f32 = features.iter().map(|f| f * f).sum::<f32>().sqrt();
60
61 let record = NodeRecord {
62 page_type,
63 confidence,
64 freshness: 0,
65 flags: NodeFlags::default(),
66 content_hash: 0,
67 rendered_at: 0,
68 http_status: 0,
69 depth: 0,
70 inbound_count: 0,
71 outbound_count: 0,
72 feature_norm: norm,
73 reserved: 0,
74 };
75
76 self.urls.push(url.to_string());
77 self.nodes.push(record);
78 self.features.push(features);
79
80 index
81 }
82
83 pub fn add_edge(
85 &mut self,
86 from: u32,
87 to: u32,
88 edge_type: EdgeType,
89 weight: u8,
90 flags: EdgeFlags,
91 ) {
92 self.edges.push(EdgeData {
93 from,
94 edge: EdgeRecord {
95 target_node: to,
96 edge_type,
97 weight,
98 flags,
99 reserved: 0,
100 },
101 });
102 }
103
104 pub fn add_action(
106 &mut self,
107 node: u32,
108 opcode: OpCode,
109 target_node: i32,
110 cost_hint: u8,
111 risk: u8,
112 ) {
113 self.actions.push(ActionData {
114 node,
115 action: ActionRecord {
116 opcode,
117 target_node,
118 cost_hint,
119 risk,
120 http_executable: false,
121 },
122 });
123 }
124
125 pub fn add_action_http(
129 &mut self,
130 node: u32,
131 opcode: OpCode,
132 target_node: i32,
133 cost_hint: u8,
134 risk: u8,
135 ) {
136 self.actions.push(ActionData {
137 node,
138 action: ActionRecord {
139 opcode,
140 target_node,
141 cost_hint,
142 risk,
143 http_executable: true,
144 },
145 });
146 }
147
148 pub fn set_rendered(&mut self, node: u32, features: [f32; FEATURE_DIM]) {
150 let idx = node as usize;
151 if idx < self.nodes.len() {
152 self.nodes[idx].flags.0 |= NodeFlags::RENDERED;
153 self.nodes[idx].freshness = 255;
154 self.features[idx] = features;
155 let norm: f32 = features.iter().map(|f| f * f).sum::<f32>().sqrt();
156 self.nodes[idx].feature_norm = norm;
157 }
158 }
159
160 pub fn get_feature(&self, node: u32, dimension: usize) -> f32 {
162 let idx = node as usize;
163 if idx < self.features.len() && dimension < FEATURE_DIM {
164 self.features[idx][dimension]
165 } else {
166 0.0
167 }
168 }
169
170 pub fn update_feature(&mut self, node: u32, dimension: usize, value: f32) {
175 let idx = node as usize;
176 if idx < self.features.len() && dimension < FEATURE_DIM {
177 self.features[idx][dimension] = value;
178 }
179 }
180
181 pub fn merge_flags(&mut self, node: u32, flags: NodeFlags) {
186 let idx = node as usize;
187 if idx < self.nodes.len() {
188 self.nodes[idx].flags.0 |= flags.0;
189 }
190 }
191
192 pub fn build(mut self) -> SiteMap {
194 let node_count = self.nodes.len();
195
196 self.edges.sort_by_key(|e| e.from);
198
199 let mut edge_index = vec![0u32; node_count + 1];
201 let mut edges = Vec::with_capacity(self.edges.len());
202 for ed in &self.edges {
203 edges.push(ed.edge.clone());
204 }
205
206 let mut counts = vec![0u32; node_count];
208 for ed in &self.edges {
209 if (ed.from as usize) < node_count {
210 counts[ed.from as usize] += 1;
211 }
212 }
213 for i in 0..node_count {
215 edge_index[i + 1] = edge_index[i] + counts[i];
216 }
217
218 for ed in &self.edges {
220 let from = ed.from as usize;
221 let to = ed.edge.target_node as usize;
222 if from < node_count {
223 self.nodes[from].outbound_count = self.nodes[from].outbound_count.saturating_add(1);
224 }
225 if to < node_count {
226 self.nodes[to].inbound_count = self.nodes[to].inbound_count.saturating_add(1);
227 }
228 }
229
230 self.actions.sort_by_key(|a| a.node);
232
233 let mut action_index = vec![0u32; node_count + 1];
235 let mut actions = Vec::with_capacity(self.actions.len());
236 for ad in &self.actions {
237 actions.push(ad.action.clone());
238 }
239 let mut act_counts = vec![0u32; node_count];
240 for ad in &self.actions {
241 if (ad.node as usize) < node_count {
242 act_counts[ad.node as usize] += 1;
243 }
244 }
245 for i in 0..node_count {
246 action_index[i + 1] = action_index[i] + act_counts[i];
247 }
248
249 let k = if node_count < 30 {
251 1.max(node_count / 3)
252 } else {
253 3.max(((node_count as f64 / 10.0).sqrt()) as usize)
254 };
255 let (cluster_assignments, cluster_centroids) = compute_clusters(&self.features, k);
256
257 let mapped_at = SystemTime::now()
258 .duration_since(UNIX_EPOCH)
259 .unwrap_or_default()
260 .as_secs();
261
262 let mut flags: u16 = 0;
263 if self.has_sitemap {
264 flags |= 1;
265 }
266
267 let header = MapHeader {
268 magic: SITEMAP_MAGIC,
269 format_version: FORMAT_VERSION,
270 domain: self.domain,
271 mapped_at,
272 node_count: node_count as u32,
273 edge_count: edges.len() as u32,
274 cluster_count: cluster_centroids.len() as u16,
275 flags,
276 };
277
278 SiteMap {
279 header,
280 nodes: self.nodes,
281 edges,
282 edge_index,
283 features: self.features,
284 actions,
285 action_index,
286 cluster_assignments,
287 cluster_centroids,
288 urls: self.urls,
289 }
290 }
291}
292
293fn compute_clusters(
295 features: &[[f32; FEATURE_DIM]],
296 k: usize,
297) -> (Vec<u16>, Vec<[f32; FEATURE_DIM]>) {
298 let n = features.len();
299 if n == 0 || k == 0 {
300 return (Vec::new(), Vec::new());
301 }
302 let k = k.min(n);
303
304 let mut centroids: Vec<[f32; FEATURE_DIM]> = Vec::with_capacity(k);
306 for i in 0..k {
307 let idx = i * n / k;
308 centroids.push(features[idx]);
309 }
310
311 let mut assignments = vec![0u16; n];
312
313 for _ in 0..20 {
315 let mut changed = false;
316
317 for (i, feat) in features.iter().enumerate() {
319 let mut best_cluster = 0u16;
320 let mut best_dist = f32::MAX;
321 for (c, centroid) in centroids.iter().enumerate() {
322 let dist: f32 = feat
323 .iter()
324 .zip(centroid.iter())
325 .map(|(a, b)| (a - b) * (a - b))
326 .sum();
327 if dist < best_dist {
328 best_dist = dist;
329 best_cluster = c as u16;
330 }
331 }
332 if assignments[i] != best_cluster {
333 assignments[i] = best_cluster;
334 changed = true;
335 }
336 }
337
338 if !changed {
339 break;
340 }
341
342 let mut sums = vec![[0.0f32; FEATURE_DIM]; k];
344 let mut counts = vec![0u32; k];
345 for (i, feat) in features.iter().enumerate() {
346 let c = assignments[i] as usize;
347 counts[c] += 1;
348 for (d, &val) in feat.iter().enumerate() {
349 sums[c][d] += val;
350 }
351 }
352 for c in 0..k {
353 if counts[c] > 0 {
354 for (d, sum_val) in sums[c].iter().enumerate() {
355 centroids[c][d] = sum_val / counts[c] as f32;
356 }
357 }
358 }
359 }
360
361 (assignments, centroids)
362}