// agentic_codebase/format/writer.rs

1//! Writes `.acb` files from a `CodeGraph`.
2//!
3//! The writer serializes the graph into the binary `.acb` format:
4//! header, unit table, edge table, compressed string pool, feature vectors,
5//! temporal block, and index block.
6
7use std::io::Write;
8use std::path::Path;
9
10use crate::graph::CodeGraph;
11use crate::types::header::{FileHeader, HEADER_SIZE};
12use crate::types::{AcbResult, DEFAULT_DIMENSION};
13
14use super::compression::StringPoolBuilder;
15
/// Size of one code unit record on disk (96 bytes).
/// Must stay in sync with the byte layout emitted by `write_unit_record`.
pub const UNIT_RECORD_SIZE: usize = 96;

/// Size of one edge record on disk (40 bytes).
/// Must stay in sync with the byte layout emitted by `write_edge_record`.
pub const EDGE_RECORD_SIZE: usize = 40;
21
/// Writes `CodeGraph` instances to `.acb` binary format.
pub struct AcbWriter {
    /// Number of f32 values serialized per unit in the feature-vector
    /// section (section size is `unit_count * dimension * 4` bytes).
    dimension: usize,
}
26
27impl AcbWriter {
28    /// Create a new writer with the given feature vector dimension.
29    pub fn new(dimension: usize) -> Self {
30        Self { dimension }
31    }
32
33    /// Create a writer with default dimension.
34    pub fn with_default_dimension() -> Self {
35        Self::new(DEFAULT_DIMENSION)
36    }
37
38    /// Write a code graph to a file path.
39    ///
40    /// # Errors
41    ///
42    /// Returns `AcbError::Io` on write failure.
43    pub fn write_to_file(&self, graph: &CodeGraph, path: &Path) -> AcbResult<()> {
44        let mut file = std::fs::File::create(path)?;
45        self.write_to(graph, &mut file)
46    }
47
48    /// Write a code graph to any writer.
49    pub fn write_to(&self, graph: &CodeGraph, w: &mut impl Write) -> AcbResult<()> {
50        // 1. Build string pool, collecting offsets for each unit
51        let mut pool = StringPoolBuilder::new();
52        let mut unit_strings: Vec<UnitStrings> = Vec::with_capacity(graph.unit_count());
53
54        for unit in graph.units() {
55            let (name_offset, name_len) = pool.add(&unit.name);
56            let (qname_offset, qname_len) = pool.add(&unit.qualified_name);
57            let path_str = unit.file_path.to_string_lossy();
58            let (path_offset, path_len) = pool.add(&path_str);
59            unit_strings.push(UnitStrings {
60                name_offset,
61                name_len,
62                qname_offset,
63                qname_len,
64                path_offset,
65                path_len,
66            });
67        }
68
69        let compressed_pool = pool.compress();
70
71        // 2. Sort edges by source_id then target_id for contiguous grouping
72        let mut sorted_edges: Vec<_> = graph.edges().to_vec();
73        sorted_edges.sort_by(|a, b| {
74            a.source_id
75                .cmp(&b.source_id)
76                .then(a.target_id.cmp(&b.target_id))
77        });
78
79        // Compute edge offsets per unit
80        let mut unit_edge_offsets: Vec<(u64, u32)> = vec![(0, 0); graph.unit_count()];
81        {
82            let mut current_source = u64::MAX;
83            let mut current_offset = 0u64;
84            let mut current_count = 0u32;
85
86            for (i, edge) in sorted_edges.iter().enumerate() {
87                if edge.source_id != current_source {
88                    if current_source != u64::MAX {
89                        unit_edge_offsets[current_source as usize] =
90                            (current_offset, current_count);
91                    }
92                    current_source = edge.source_id;
93                    current_offset = (i as u64) * EDGE_RECORD_SIZE as u64;
94                    current_count = 0;
95                }
96                current_count += 1;
97            }
98            if current_source != u64::MAX {
99                unit_edge_offsets[current_source as usize] = (current_offset, current_count);
100            }
101        }
102
103        // 3. Calculate section offsets
104        let unit_table_offset = HEADER_SIZE as u64;
105        let edge_table_offset =
106            unit_table_offset + (graph.unit_count() as u64) * UNIT_RECORD_SIZE as u64;
107        let string_pool_offset =
108            edge_table_offset + (sorted_edges.len() as u64) * EDGE_RECORD_SIZE as u64;
109        // String pool section: 8 bytes uncompressed size + compressed data
110        let string_pool_section_size = 8 + compressed_pool.len() as u64;
111        let feature_vec_offset = string_pool_offset + string_pool_section_size;
112        let feature_vec_size = (graph.unit_count() as u64) * (self.dimension as u64) * 4;
113        let temporal_offset = feature_vec_offset + feature_vec_size;
114        // Empty temporal block for now (just 16 bytes: two u64 zeros)
115        let temporal_size = 16u64;
116        let index_offset = temporal_offset + temporal_size;
117
118        // 4. Build header
119        let mut header = FileHeader::new(self.dimension as u32);
120        header.unit_count = graph.unit_count() as u64;
121        header.edge_count = sorted_edges.len() as u64;
122        header.language_count = graph.languages().len() as u32;
123        header.unit_table_offset = unit_table_offset;
124        header.edge_table_offset = edge_table_offset;
125        header.string_pool_offset = string_pool_offset;
126        header.feature_vec_offset = feature_vec_offset;
127        header.temporal_offset = temporal_offset;
128        header.index_offset = index_offset;
129
130        // 5. Write header
131        header.write_to(w)?;
132
133        // 6. Write unit table
134        for (i, unit) in graph.units().iter().enumerate() {
135            let us = &unit_strings[i];
136            let (eoff, ecnt) = unit_edge_offsets[i];
137            write_unit_record(w, unit, us, eoff, ecnt)?;
138        }
139
140        // 7. Write edge table
141        for edge in &sorted_edges {
142            write_edge_record(w, edge)?;
143        }
144
145        // 8. Write string pool (uncompressed size + compressed data)
146        w.write_all(&(pool.uncompressed_size() as u64).to_le_bytes())?;
147        w.write_all(&compressed_pool)?;
148
149        // 9. Write feature vectors
150        for unit in graph.units() {
151            for &val in &unit.feature_vec {
152                w.write_all(&val.to_le_bytes())?;
153            }
154            // Pad if vector is shorter than dimension
155            for _ in unit.feature_vec.len()..self.dimension {
156                w.write_all(&0.0f32.to_le_bytes())?;
157            }
158        }
159
160        // 10. Write temporal block (empty placeholder)
161        w.write_all(&0u64.to_le_bytes())?; // history_size = 0
162        w.write_all(&0u64.to_le_bytes())?; // coupling_count = 0
163
164        // 11. Write index block (end marker only for now)
165        w.write_all(&0xFFFFFFFFu32.to_le_bytes())?;
166
167        Ok(())
168    }
169}
170
/// Intermediate struct for string pool references.
///
/// Each (offset, len) pair locates one string inside the *uncompressed*
/// string pool, as returned by `StringPoolBuilder::add`.
struct UnitStrings {
    /// Byte offset of the unit's short name in the string pool.
    name_offset: u32,
    /// Length in bytes of the short name.
    name_len: u16,
    /// Byte offset of the fully qualified name.
    qname_offset: u32,
    /// Length in bytes of the qualified name.
    qname_len: u16,
    /// Byte offset of the source file path (lossy UTF-8).
    path_offset: u32,
    /// Length in bytes of the file path.
    path_len: u16,
}
180
181/// Write a 96-byte code unit record.
182fn write_unit_record(
183    w: &mut impl Write,
184    unit: &crate::types::CodeUnit,
185    strings: &UnitStrings,
186    edge_offset: u64,
187    edge_count: u32,
188) -> AcbResult<()> {
189    // Identity: 16 bytes
190    w.write_all(&unit.id.to_le_bytes())?; // 8
191    w.write_all(&[unit.unit_type as u8])?; // 1
192    w.write_all(&[unit.language as u8])?; // 1
193    w.write_all(&[unit.visibility as u8])?; // 1
194    let flags: u8 = (unit.is_async as u8) | ((unit.is_generator as u8) << 1);
195    w.write_all(&[flags])?; // 1
196    let complexity_u16 = unit.complexity as u16;
197    w.write_all(&complexity_u16.to_le_bytes())?; // 2
198    w.write_all(&[0u8; 2])?; // _pad1: 2
199
200    // String references: 24 bytes
201    w.write_all(&strings.name_offset.to_le_bytes())?; // 4
202    w.write_all(&strings.name_len.to_le_bytes())?; // 2
203    w.write_all(&strings.qname_offset.to_le_bytes())?; // 4
204    w.write_all(&strings.qname_len.to_le_bytes())?; // 2
205    w.write_all(&strings.path_offset.to_le_bytes())?; // 4
206    w.write_all(&strings.path_len.to_le_bytes())?; // 2
207    w.write_all(&[0u8; 6])?; // _pad2: 6
208
209    // Source location: 16 bytes
210    w.write_all(&unit.span.start_line.to_le_bytes())?; // 4
211    let start_col_u16 = unit.span.start_col as u16;
212    w.write_all(&start_col_u16.to_le_bytes())?; // 2
213    w.write_all(&unit.span.end_line.to_le_bytes())?; // 4
214    let end_col_u16 = unit.span.end_col as u16;
215    w.write_all(&end_col_u16.to_le_bytes())?; // 2
216    w.write_all(&[0u8; 4])?; // _pad3: 4
217
218    // Temporal: 24 bytes
219    w.write_all(&unit.created_at.to_le_bytes())?; // 8
220    w.write_all(&unit.last_modified.to_le_bytes())?; // 8
221    let change_count_u32 = unit.change_count;
222    w.write_all(&change_count_u32.to_le_bytes())?; // 4
223    let stability_x100 = (unit.stability_score * 100.0).round() as u16;
224    w.write_all(&stability_x100.to_le_bytes())?; // 2
225    w.write_all(&[0u8; 2])?; // _pad4: 2
226
227    // Graph: 16 bytes
228    w.write_all(&edge_offset.to_le_bytes())?; // 8
229    w.write_all(&edge_count.to_le_bytes())?; // 4
230    w.write_all(&[0u8; 4])?; // _pad5: 4
231
232    Ok(())
233}
234
235/// Write a 40-byte edge record.
236fn write_edge_record(w: &mut impl Write, edge: &crate::types::Edge) -> AcbResult<()> {
237    w.write_all(&edge.source_id.to_le_bytes())?; // 8
238    w.write_all(&edge.target_id.to_le_bytes())?; // 8
239    w.write_all(&[edge.edge_type as u8])?; // 1
240    w.write_all(&[0u8; 3])?; // _pad1: 3
241    w.write_all(&edge.weight.to_bits().to_le_bytes())?; // 4
242    w.write_all(&edge.created_at.to_le_bytes())?; // 8
243    w.write_all(&edge.context.to_le_bytes())?; // 4
244    w.write_all(&[0u8; 4])?; // _pad2: 4
245
246    Ok(())
247}