Skip to main content

agentic_codebase/format/
reader.rs

1//! Reads `.acb` files into a `CodeGraph`.
2//!
3//! The reader deserializes the binary `.acb` format back into the
4//! in-memory graph structure.
5
6use std::io::Read;
7use std::path::{Path, PathBuf};
8
9use crate::graph::CodeGraph;
10use crate::types::header::{FileHeader, HEADER_SIZE};
11use crate::types::{
12    AcbError, AcbResult, CodeUnit, CodeUnitType, Edge, EdgeType, Language, Span, Visibility,
13    ACB_MAGIC, FORMAT_VERSION,
14};
15
16use super::compression::StringPool;
17use super::writer::{EDGE_RECORD_SIZE, UNIT_RECORD_SIZE};
18use super::AcbWriter;
19
/// How the reader treats `.acb` files written with an older format version.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StorageMigrationPolicy {
    /// Load the legacy file, then try to rewrite it in the current format.
    AutoSafe,
    /// Refuse to load legacy files.
    Strict,
    /// Load the legacy file but leave it untouched on disk.
    Off,
}

impl StorageMigrationPolicy {
    /// Resolve the policy from the environment variable `name`.
    ///
    /// The value is trimmed and compared case-insensitively. `strict` selects
    /// [`Self::Strict`]; `off`, `disabled` and `none` select [`Self::Off`].
    /// An unset, unreadable, or unrecognised value selects [`Self::AutoSafe`].
    fn from_env(name: &str) -> Self {
        let setting = match std::env::var(name) {
            Ok(value) => value,
            Err(_) => String::from("auto-safe"),
        };
        let normalized = setting.trim().to_ascii_lowercase();

        if normalized == "strict" {
            Self::Strict
        } else if ["off", "disabled", "none"].contains(&normalized.as_str()) {
            Self::Off
        } else {
            Self::AutoSafe
        }
    }
}
37
38/// Reads `.acb` files into `CodeGraph` instances.
39pub struct AcbReader;
40
41impl AcbReader {
42    /// Read a code graph from a file path.
43    ///
44    /// # Errors
45    ///
46    /// Returns errors on I/O failure, corrupt data, or version mismatch.
47    pub fn read_from_file(path: &Path) -> AcbResult<CodeGraph> {
48        if !path.exists() {
49            return Err(AcbError::PathNotFound(path.to_path_buf()));
50        }
51        let data = std::fs::read(path)?;
52        if data.len() < HEADER_SIZE {
53            return Err(AcbError::Truncated);
54        }
55        let legacy_version = detect_legacy_version(&data);
56        let migration_policy = StorageMigrationPolicy::from_env("ACB_STORAGE_MIGRATION_POLICY");
57        if let Some(from_version) = legacy_version {
58            if migration_policy == StorageMigrationPolicy::Strict {
59                return Err(AcbError::UnsupportedVersion(from_version));
60            }
61        }
62
63        let graph = Self::read_from_data(&data)?;
64        if let Some(from_version) = legacy_version {
65            match migration_policy {
66                StorageMigrationPolicy::AutoSafe => {
67                    if let Err(err) = migrate_file_in_place(path, &graph, from_version) {
68                        tracing::warn!(
69                            "Failed to auto-migrate {} from v{}: {}",
70                            path.display(),
71                            from_version,
72                            err
73                        );
74                    }
75                }
76                StorageMigrationPolicy::Off => {
77                    tracing::warn!(
78                        "Legacy .acb version {} loaded for {} with migration disabled",
79                        from_version,
80                        path.display()
81                    );
82                }
83                StorageMigrationPolicy::Strict => {}
84            }
85        }
86        Ok(graph)
87    }
88
89    /// Read a code graph from a byte slice.
90    pub fn read_from_data(data: &[u8]) -> AcbResult<CodeGraph> {
91        if data.len() < HEADER_SIZE {
92            return Err(AcbError::Truncated);
93        }
94
95        // 1. Read header
96        let header_bytes: [u8; HEADER_SIZE] = data[..HEADER_SIZE]
97            .try_into()
98            .map_err(|_| AcbError::Truncated)?;
99        let header = FileHeader::from_bytes(&header_bytes)?;
100
101        // Validate offsets are within file bounds
102        let file_len = data.len() as u64;
103        validate_offset(header.unit_table_offset, file_len)?;
104        if header.unit_count > 0 {
105            let unit_table_end =
106                header.unit_table_offset + header.unit_count * UNIT_RECORD_SIZE as u64;
107            if unit_table_end > file_len {
108                return Err(AcbError::Truncated);
109            }
110        }
111        if header.edge_count > 0 {
112            validate_offset(header.edge_table_offset, file_len)?;
113            let edge_table_end =
114                header.edge_table_offset + header.edge_count * EDGE_RECORD_SIZE as u64;
115            if edge_table_end > file_len {
116                return Err(AcbError::Truncated);
117            }
118        }
119
120        // 2. Read string pool
121        let pool = if header.string_pool_offset > 0 && header.string_pool_offset < file_len {
122            let pool_start = header.string_pool_offset as usize;
123            if pool_start + 8 > data.len() {
124                return Err(AcbError::Truncated);
125            }
126            let _uncompressed_size =
127                u64::from_le_bytes(data[pool_start..pool_start + 8].try_into().unwrap());
128            let compressed_data = &data[pool_start + 8..];
129            // The compressed data extends until the feature_vec_offset
130            let compressed_end = if header.feature_vec_offset > 0 {
131                (header.feature_vec_offset as usize).saturating_sub(pool_start + 8)
132            } else {
133                compressed_data.len()
134            };
135            let compressed_slice = &compressed_data[..compressed_end.min(compressed_data.len())];
136            StringPool::from_compressed(compressed_slice)?
137        } else {
138            StringPool::from_data(Vec::new())
139        };
140
141        // 3. Read unit table
142        let mut graph = CodeGraph::new(header.dimension as usize);
143        let mut unit_edge_info: Vec<(u64, u32)> = Vec::with_capacity(header.unit_count as usize);
144
145        for i in 0..header.unit_count {
146            let offset = header.unit_table_offset as usize + (i as usize) * UNIT_RECORD_SIZE;
147            let record = &data[offset..offset + UNIT_RECORD_SIZE];
148            let (unit, edge_offset, edge_count) = read_unit_record(record, &pool)?;
149            unit_edge_info.push((edge_offset, edge_count));
150            graph.add_unit(unit);
151        }
152
153        // 4. Read edge table
154        for i in 0..header.edge_count {
155            let offset = header.edge_table_offset as usize + (i as usize) * EDGE_RECORD_SIZE;
156            let record = &data[offset..offset + EDGE_RECORD_SIZE];
157            let edge = read_edge_record(record)?;
158            // Lenient: skip invalid edges rather than failing the entire read
159            if let Err(e) = graph.add_edge(edge) {
160                tracing::warn!("Skipping invalid edge during read: {}", e);
161            }
162        }
163
164        // 5. Read feature vectors
165        if header.feature_vec_offset > 0 && header.feature_vec_offset < file_len {
166            let dim = header.dimension as usize;
167            for i in 0..header.unit_count {
168                let vec_offset = header.feature_vec_offset as usize + (i as usize) * dim * 4;
169                if vec_offset + dim * 4 <= data.len() {
170                    let mut fv = Vec::with_capacity(dim);
171                    for d in 0..dim {
172                        let fo = vec_offset + d * 4;
173                        let val = f32::from_le_bytes(data[fo..fo + 4].try_into().unwrap());
174                        fv.push(val);
175                    }
176                    if let Some(unit) = graph.get_unit_mut(i) {
177                        unit.feature_vec = fv;
178                    }
179                }
180            }
181        }
182
183        Ok(graph)
184    }
185
186    /// Read a code graph from a reader (consumes all bytes).
187    pub fn read_from(reader: &mut impl Read) -> AcbResult<CodeGraph> {
188        let mut data = Vec::new();
189        reader.read_to_end(&mut data)?;
190        Self::read_from_data(&data)
191    }
192}
193
194fn validate_offset(offset: u64, file_len: u64) -> AcbResult<()> {
195    if offset > file_len {
196        Err(AcbError::Truncated)
197    } else {
198        Ok(())
199    }
200}
201
202fn detect_legacy_version(data: &[u8]) -> Option<u32> {
203    if data.len() < 8 {
204        return None;
205    }
206    if data[0..4] != ACB_MAGIC {
207        return None;
208    }
209    let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
210    if version < FORMAT_VERSION {
211        Some(version)
212    } else {
213        None
214    }
215}
216
217fn migrate_file_in_place(path: &Path, graph: &CodeGraph, from_version: u32) -> AcbResult<()> {
218    let migration_dir = path
219        .parent()
220        .unwrap_or_else(|| Path::new("."))
221        .join(".acb-migrations");
222    std::fs::create_dir_all(&migration_dir)?;
223
224    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("graph");
225    let ts = chrono::Utc::now().format("%Y%m%d%H%M%S");
226    let checkpoint = migration_dir.join(format!("{stem}.v{from_version}.{ts}.acb.checkpoint"));
227    std::fs::copy(path, &checkpoint)?;
228
229    let writer = AcbWriter::new(graph.dimension());
230    writer.write_to_file(graph, path)?;
231    tracing::info!(
232        "Auto-migrated {} from v{} to v{} (checkpoint: {})",
233        path.display(),
234        from_version,
235        FORMAT_VERSION,
236        checkpoint.display()
237    );
238    Ok(())
239}
240
241/// Read a 96-byte code unit record from a slice.
242fn read_unit_record(data: &[u8], pool: &StringPool) -> AcbResult<(CodeUnit, u64, u32)> {
243    let id = u64::from_le_bytes(data[0..8].try_into().unwrap());
244    let unit_type = CodeUnitType::from_u8(data[8]).ok_or(AcbError::Corrupt(0))?;
245    let language = Language::from_u8(data[9]).ok_or(AcbError::Corrupt(1))?;
246    let visibility = Visibility::from_u8(data[10]).ok_or(AcbError::Corrupt(2))?;
247    let flags = data[11];
248    let is_async = (flags & 1) != 0;
249    let is_generator = (flags & 2) != 0;
250    let complexity = u16::from_le_bytes(data[12..14].try_into().unwrap()) as u32;
251    // _pad1: 14..16
252
253    // String references
254    let name_offset = u32::from_le_bytes(data[16..20].try_into().unwrap());
255    let name_len = u16::from_le_bytes(data[20..22].try_into().unwrap());
256    let qname_offset = u32::from_le_bytes(data[22..26].try_into().unwrap());
257    let qname_len = u16::from_le_bytes(data[26..28].try_into().unwrap());
258    let path_offset = u32::from_le_bytes(data[28..32].try_into().unwrap());
259    let path_len = u16::from_le_bytes(data[32..34].try_into().unwrap());
260    // _pad2: 34..40
261
262    // Source location
263    let start_line = u32::from_le_bytes(data[40..44].try_into().unwrap());
264    let start_col = u16::from_le_bytes(data[44..46].try_into().unwrap()) as u32;
265    let end_line = u32::from_le_bytes(data[46..50].try_into().unwrap());
266    let end_col = u16::from_le_bytes(data[50..52].try_into().unwrap()) as u32;
267    // _pad3: 52..56
268
269    // Temporal
270    let created_at = u64::from_le_bytes(data[56..64].try_into().unwrap());
271    let last_modified = u64::from_le_bytes(data[64..72].try_into().unwrap());
272    let change_count = u32::from_le_bytes(data[72..76].try_into().unwrap());
273    let stability_x100 = u16::from_le_bytes(data[76..78].try_into().unwrap());
274    let stability_score = stability_x100 as f32 / 100.0;
275    // _pad4: 78..80
276
277    // Graph
278    let edge_offset = u64::from_le_bytes(data[80..88].try_into().unwrap());
279    let edge_count = u32::from_le_bytes(data[88..92].try_into().unwrap());
280    // _pad5: 92..96
281
282    // Resolve strings from pool
283    let name = if name_len > 0 {
284        pool.get(name_offset, name_len)?.to_string()
285    } else {
286        String::new()
287    };
288    let qualified_name = if qname_len > 0 {
289        pool.get(qname_offset, qname_len)?.to_string()
290    } else {
291        String::new()
292    };
293    let file_path = if path_len > 0 {
294        PathBuf::from(pool.get(path_offset, path_len)?)
295    } else {
296        PathBuf::new()
297    };
298
299    let mut unit = CodeUnit::new(
300        unit_type,
301        language,
302        name,
303        qualified_name,
304        file_path,
305        Span::new(start_line, start_col, end_line, end_col),
306    );
307    unit.id = id;
308    unit.visibility = visibility;
309    unit.is_async = is_async;
310    unit.is_generator = is_generator;
311    unit.complexity = complexity;
312    unit.created_at = created_at;
313    unit.last_modified = last_modified;
314    unit.change_count = change_count;
315    unit.stability_score = stability_score;
316
317    Ok((unit, edge_offset, edge_count))
318}
319
320/// Read a 40-byte edge record from a slice.
321fn read_edge_record(data: &[u8]) -> AcbResult<Edge> {
322    let source_id = u64::from_le_bytes(data[0..8].try_into().unwrap());
323    let target_id = u64::from_le_bytes(data[8..16].try_into().unwrap());
324    let edge_type = EdgeType::from_u8(data[16]).ok_or(AcbError::Corrupt(16))?;
325    // _pad1: 17..20
326    let weight_bits = u32::from_le_bytes(data[20..24].try_into().unwrap());
327    let weight = f32::from_bits(weight_bits);
328    let created_at = u64::from_le_bytes(data[24..32].try_into().unwrap());
329    let context = u32::from_le_bytes(data[32..36].try_into().unwrap());
330    // _pad2: 36..40
331
332    Ok(Edge {
333        source_id,
334        target_id,
335        edge_type,
336        weight,
337        created_at,
338        context,
339    })
340}