Skip to main content

cortex_runtime/map/
deserializer.rs

//! Deserialize a SiteMap from the binary CTX format.
//!
//! Verifies the trailing CRC32 checksum to detect corruption.
4
5use crate::map::serializer::crc32;
6use crate::map::types::*;
7use anyhow::{bail, Context, Result};
8use byteorder::{LittleEndian, ReadBytesExt};
9use std::io::Cursor;
10
11impl SiteMap {
12    /// Deserialize a SiteMap from binary CTX format.
13    ///
14    /// Verifies the trailing CRC32 checksum. Returns an error if the
15    /// file is truncated or corrupted.
16    pub fn deserialize(data: &[u8]) -> Result<Self> {
17        // Verify trailing CRC32 checksum (last 4 bytes)
18        if data.len() < 4 {
19            bail!("map file too small: {} bytes", data.len());
20        }
21        let payload = &data[..data.len() - 4];
22        let stored_checksum = {
23            let mut c = Cursor::new(&data[data.len() - 4..]);
24            c.read_u32::<LittleEndian>().context("reading checksum")?
25        };
26        let computed_checksum = crc32(payload);
27        if stored_checksum != computed_checksum {
28            bail!(
29                "map file integrity check failed: checksum mismatch \
30                 (stored 0x{:08X}, computed 0x{:08X}). File may be corrupted.",
31                stored_checksum,
32                computed_checksum
33            );
34        }
35
36        let mut r = Cursor::new(payload);
37
38        // ─── Header ───────────────────────────────────────
39        let magic = r.read_u32::<LittleEndian>().context("reading magic")?;
40        if magic != SITEMAP_MAGIC {
41            bail!(
42                "invalid magic bytes: expected 0x{:08X}, got 0x{:08X}",
43                SITEMAP_MAGIC,
44                magic
45            );
46        }
47
48        let format_version = r.read_u16::<LittleEndian>().context("reading version")?;
49        if format_version != FORMAT_VERSION {
50            bail!("unsupported format version: expected {FORMAT_VERSION}, got {format_version}");
51        }
52
53        let domain_length = r
54            .read_u16::<LittleEndian>()
55            .context("reading domain length")? as usize;
56        let mut domain_bytes = vec![0u8; domain_length];
57        std::io::Read::read_exact(&mut r, &mut domain_bytes).context("reading domain")?;
58        let domain = String::from_utf8(domain_bytes).context("domain not valid utf8")?;
59
60        let mapped_at = r.read_u64::<LittleEndian>().context("reading mapped_at")?;
61        let node_count = r.read_u32::<LittleEndian>().context("reading node_count")? as usize;
62        let edge_count = r.read_u32::<LittleEndian>().context("reading edge_count")? as usize;
63        let cluster_count = r
64            .read_u16::<LittleEndian>()
65            .context("reading cluster_count")? as usize;
66        let flags = r.read_u16::<LittleEndian>().context("reading flags")?;
67
68        // ─── Node Table ───────────────────────────────────
69        let mut nodes = Vec::with_capacity(node_count);
70        for _ in 0..node_count {
71            let page_type = PageType::from_u8(r.read_u8()?);
72            let confidence = r.read_u8()?;
73            let freshness = r.read_u8()?;
74            let node_flags = NodeFlags(r.read_u8()?);
75            let content_hash = r.read_u32::<LittleEndian>()?;
76            let rendered_at = r.read_u32::<LittleEndian>()?;
77            let http_status = r.read_u16::<LittleEndian>()?;
78            let depth = r.read_u16::<LittleEndian>()?;
79            let inbound_count = r.read_u16::<LittleEndian>()?;
80            let outbound_count = r.read_u16::<LittleEndian>()?;
81            let feature_norm = r.read_f32::<LittleEndian>()?;
82            let reserved = r.read_u32::<LittleEndian>()?;
83
84            nodes.push(NodeRecord {
85                page_type,
86                confidence,
87                freshness,
88                flags: node_flags,
89                content_hash,
90                rendered_at,
91                http_status,
92                depth,
93                inbound_count,
94                outbound_count,
95                feature_norm,
96                reserved,
97            });
98        }
99
100        // ─── Edge Table ───────────────────────────────────
101        let mut edges = Vec::with_capacity(edge_count);
102        for _ in 0..edge_count {
103            let target_node = r.read_u32::<LittleEndian>()?;
104            let edge_type = EdgeType::from_u8(r.read_u8()?);
105            let weight = r.read_u8()?;
106            let edge_flags = EdgeFlags(r.read_u8()?);
107            let reserved = r.read_u8()?;
108
109            edges.push(EdgeRecord {
110                target_node,
111                edge_type,
112                weight,
113                flags: edge_flags,
114                reserved,
115            });
116        }
117
118        // Edge CSR index
119        let mut edge_index = Vec::with_capacity(node_count + 1);
120        for _ in 0..=node_count {
121            edge_index.push(r.read_u32::<LittleEndian>()?);
122        }
123
124        // ─── Feature Matrix ──────────────────────────────
125        let mut features = Vec::with_capacity(node_count);
126        for _ in 0..node_count {
127            let mut feat = [0.0f32; FEATURE_DIM];
128            for f in &mut feat {
129                *f = r.read_f32::<LittleEndian>()?;
130            }
131            features.push(feat);
132        }
133
134        // ─── Action Catalog ──────────────────────────────
135        let action_count = r.read_u32::<LittleEndian>()? as usize;
136        let mut actions = Vec::with_capacity(action_count);
137        for _ in 0..action_count {
138            let opcode_raw = r.read_u16::<LittleEndian>()?;
139            let target_node = r.read_i32::<LittleEndian>()?;
140            let cost_hint = r.read_u8()?;
141            let risk = r.read_u8()?;
142
143            actions.push(ActionRecord {
144                opcode: OpCode::from_u16(opcode_raw),
145                target_node,
146                cost_hint,
147                risk,
148                http_executable: false, // Binary format doesn't store this yet
149            });
150        }
151
152        // Action CSR index
153        let mut action_index = Vec::with_capacity(node_count + 1);
154        for _ in 0..=node_count {
155            action_index.push(r.read_u32::<LittleEndian>()?);
156        }
157
158        // ─── Cluster Table ───────────────────────────────
159        let mut cluster_assignments = Vec::with_capacity(node_count);
160        for _ in 0..node_count {
161            cluster_assignments.push(r.read_u16::<LittleEndian>()?);
162        }
163        let mut cluster_centroids = Vec::with_capacity(cluster_count);
164        for _ in 0..cluster_count {
165            let mut centroid = [0.0f32; FEATURE_DIM];
166            for f in &mut centroid {
167                *f = r.read_f32::<LittleEndian>()?;
168            }
169            cluster_centroids.push(centroid);
170        }
171
172        // ─── URL Table ───────────────────────────────────
173        let url_data_len = r.read_u32::<LittleEndian>()? as usize;
174        let mut url_data = vec![0u8; url_data_len];
175        std::io::Read::read_exact(&mut r, &mut url_data)?;
176
177        let mut url_offsets = Vec::with_capacity(node_count);
178        for _ in 0..node_count {
179            url_offsets.push(r.read_u32::<LittleEndian>()? as usize);
180        }
181
182        // Parse URLs from null-terminated strings
183        let mut urls = Vec::with_capacity(node_count);
184        for &offset in &url_offsets {
185            let end = url_data[offset..]
186                .iter()
187                .position(|&b| b == 0)
188                .map(|p| offset + p)
189                .unwrap_or(url_data_len);
190            let url = String::from_utf8_lossy(&url_data[offset..end]).to_string();
191            urls.push(url);
192        }
193
194        let header = MapHeader {
195            magic,
196            format_version,
197            domain,
198            mapped_at,
199            node_count: node_count as u32,
200            edge_count: edge_count as u32,
201            cluster_count: cluster_count as u16,
202            flags,
203        };
204
205        Ok(SiteMap {
206            header,
207            nodes,
208            edges,
209            edge_index,
210            features,
211            actions,
212            action_index,
213            cluster_assignments,
214            cluster_centroids,
215            urls,
216        })
217    }
218}