Skip to main content

shape_vm/bytecode/
content_addressed.rs

1use super::*;
2use crate::type_tracking::{FrameDescriptor, StorageHint};
3
4#[derive(Clone, Copy, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
5pub struct FunctionHash(pub [u8; 32]);
6
7impl std::fmt::Debug for FunctionHash {
8    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
9        write!(f, "FunctionHash({})", self)
10    }
11}
12
13impl std::fmt::Display for FunctionHash {
14    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15        for byte in &self.0 {
16            write!(f, "{:02x}", byte)?;
17        }
18        Ok(())
19    }
20}
21
22impl FunctionHash {
23    /// The zero hash, used as a sentinel/placeholder.
24    pub const ZERO: Self = Self([0u8; 32]);
25}
26
27/// A self-contained, content-addressed function blob.
28///
29/// Each blob carries its own instructions, constants, and strings (no shared
30/// pools). The `content_hash` is the SHA-256 of the serialized content fields,
31/// making deduplication and caching trivial.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct FunctionBlob {
34    /// SHA-256 hash of the serialized content (everything below).
35    pub content_hash: FunctionHash,
36
37    // -- metadata --
38    pub name: String,
39    pub arity: u16,
40    pub param_names: Vec<String>,
41    pub locals_count: u16,
42    pub is_closure: bool,
43    pub captures_count: u16,
44    pub is_async: bool,
45    #[serde(default)]
46    pub ref_params: Vec<bool>,
47    #[serde(default)]
48    pub ref_mutates: Vec<bool>,
49    #[serde(default)]
50    pub mutable_captures: Vec<bool>,
51
52    // -- code --
53    /// This function's bytecode instructions.
54    pub instructions: Vec<Instruction>,
55    /// This function's constant pool.
56    pub constants: Vec<Constant>,
57    /// This function's string pool.
58    pub strings: Vec<String>,
59
60    // -- permissions --
61    /// Permissions required by this function (from capability_tags analysis).
62    #[serde(default = "default_permission_set")]
63    pub required_permissions: PermissionSet,
64
65    // -- dependency graph --
66    /// Content hashes of functions this blob references
67    /// (`Operand::Function(idx)` indexes into this vector).
68    pub dependencies: Vec<FunctionHash>,
69
70    /// Callee names corresponding to each dependency entry.
71    /// Used during compilation to resolve forward references; not serialized.
72    #[serde(skip, default)]
73    pub callee_names: Vec<String>,
74
75    // -- type info --
76    /// Type names this function constructs (schema references).
77    pub type_schemas: Vec<String>,
78
79    // -- foreign function dependencies --
80    /// Content hashes of foreign functions referenced by `CallForeign` opcodes.
81    /// Sorted and deduplicated for deterministic hashing.
82    #[serde(default)]
83    pub foreign_dependencies: Vec<[u8; 32]>,
84
85    // -- debug --
86    /// Source mapping entries local to this blob:
87    /// `(local_instruction_offset, file_id, line)`.
88    pub source_map: Vec<(usize, u32, u32)>,
89}
90
91/// Helper struct for deterministic content hashing.
92/// We serialize exactly the fields that define the function's identity.
93#[derive(Serialize)]
94struct FunctionBlobHashInput<'a> {
95    name: &'a str,
96    arity: u16,
97    param_names: &'a [String],
98    locals_count: u16,
99    is_closure: bool,
100    captures_count: u16,
101    is_async: bool,
102    ref_params: &'a [bool],
103    ref_mutates: &'a [bool],
104    mutable_captures: &'a [bool],
105    instructions: &'a [Instruction],
106    constants: &'a [Constant],
107    strings: &'a [String],
108    dependencies: &'a [FunctionHash],
109    type_schemas: &'a [String],
110    /// Permission names sorted deterministically for stable hashing.
111    required_permission_names: Vec<&'a str>,
112    /// Content hashes of foreign functions referenced by this blob.
113    foreign_dependencies: &'a [[u8; 32]],
114}
115
116impl FunctionBlob {
117    /// Compute the content hash from the blob's fields.
118    /// Call this after populating all fields, then assign the result to `content_hash`.
119    pub fn compute_hash(&self) -> FunctionHash {
120        // Convert PermissionSet to sorted permission names for deterministic hashing.
121        let perm_names: Vec<&str> = self.required_permissions.iter().map(|p| p.name()).collect();
122        let input = FunctionBlobHashInput {
123            name: &self.name,
124            arity: self.arity,
125            param_names: &self.param_names,
126            locals_count: self.locals_count,
127            is_closure: self.is_closure,
128            captures_count: self.captures_count,
129            is_async: self.is_async,
130            ref_params: &self.ref_params,
131            ref_mutates: &self.ref_mutates,
132            mutable_captures: &self.mutable_captures,
133            instructions: &self.instructions,
134            constants: &self.constants,
135            strings: &self.strings,
136            dependencies: &self.dependencies,
137            type_schemas: &self.type_schemas,
138            required_permission_names: perm_names,
139            foreign_dependencies: &self.foreign_dependencies,
140        };
141        // Use bincode-compatible MessagePack for deterministic serialization.
142        // rmp_serde::encode::to_vec uses the struct-as-array format which is
143        // order-preserving and deterministic for the types we use here.
144        let bytes = rmp_serde::encode::to_vec(&input)
145            .expect("FunctionBlob content serialization should not fail");
146        let digest = Sha256::digest(&bytes);
147        let mut hash = [0u8; 32];
148        hash.copy_from_slice(&digest);
149        FunctionHash(hash)
150    }
151
152    /// Build a blob with all fields set, then compute and assign its content hash.
153    pub fn finalize(&mut self) {
154        self.content_hash = self.compute_hash();
155    }
156}
157
158/// A content-addressed program: a set of `FunctionBlob`s plus program-level metadata.
159///
160/// This is the **storage / cache** representation. Before execution the linker
161/// flattens it into a `LinkedProgram`.
162#[derive(Debug, Clone, Serialize, Deserialize)]
163pub struct Program {
164    /// Hash of the entry-point function.
165    pub entry: FunctionHash,
166
167    /// All function blobs keyed by content hash.
168    pub function_store: HashMap<FunctionHash, FunctionBlob>,
169
170    /// Number of locals used by top-level code.
171    pub top_level_locals_count: u16,
172
173    /// Storage hints for top-level locals.
174    #[serde(default)]
175    pub top_level_local_storage_hints: Vec<StorageHint>,
176
177    /// Module-binding variable names (index -> name).
178    pub module_binding_names: Vec<String>,
179
180    /// Storage hints for module bindings.
181    #[serde(default)]
182    pub module_binding_storage_hints: Vec<StorageHint>,
183
184    /// Per-function local storage hints.
185    #[serde(default)]
186    pub function_local_storage_hints: Vec<Vec<StorageHint>>,
187
188    /// Typed frame layout for top-level locals.
189    #[serde(default)]
190    pub top_level_frame: Option<FrameDescriptor>,
191
192    /// DataFrame schema for column name resolution.
193    pub data_schema: Option<DataFrameSchema>,
194
195    /// Type schema registry for TypedObject field resolution.
196    #[serde(default)]
197    pub type_schema_registry: shape_runtime::type_schema::TypeSchemaRegistry,
198
199    /// Trait method dispatch registry.
200    pub trait_method_symbols: HashMap<String, String>,
201
202    /// Foreign function metadata table.
203    #[serde(default)]
204    pub foreign_functions: Vec<ForeignFunctionEntry>,
205
206    /// Native `type C` layout metadata table.
207    #[serde(default)]
208    pub native_struct_layouts: Vec<NativeStructLayoutEntry>,
209
210    /// Debug information (source files, variable names).
211    pub debug_info: DebugInfo,
212}
213
214/// A linked function ready for execution in a flat instruction array.
215///
216/// Mirrors `Function` but adds `blob_hash` so the runtime can trace back
217/// to the original content-addressed blob.
218#[derive(Debug, Clone, Serialize, Deserialize)]
219pub struct LinkedFunction {
220    /// Content hash of the `FunctionBlob` this was linked from.
221    pub blob_hash: FunctionHash,
222
223    /// Offset into the flat `LinkedProgram::instructions` array.
224    pub entry_point: usize,
225    /// Number of instructions in this function's body.
226    pub body_length: usize,
227
228    // -- metadata (same as Function) --
229    pub name: String,
230    pub arity: u16,
231    pub param_names: Vec<String>,
232    pub locals_count: u16,
233    pub is_closure: bool,
234    pub captures_count: u16,
235    pub is_async: bool,
236    #[serde(default)]
237    pub ref_params: Vec<bool>,
238    #[serde(default)]
239    pub ref_mutates: Vec<bool>,
240    #[serde(default)]
241    pub mutable_captures: Vec<bool>,
242    /// Typed frame layout for this function's locals.
243    #[serde(default)]
244    pub frame_descriptor: Option<FrameDescriptor>,
245}
246
247/// A linked, execution-ready program with flat instruction/constant/string arrays.
248///
249/// This mirrors today's `BytecodeProgram` layout so the executor can run it
250/// with minimal changes. Produced by the linker from a `Program`.
251#[derive(Debug, Clone, Default, Serialize, Deserialize)]
252pub struct LinkedProgram {
253    /// Hash of the entry-point function for execution.
254    #[serde(default)]
255    pub entry: FunctionHash,
256
257    /// Flat instruction array (all functions concatenated).
258    pub instructions: Vec<Instruction>,
259
260    /// Merged constant pool.
261    pub constants: Vec<Constant>,
262
263    /// Merged string pool.
264    pub strings: Vec<String>,
265
266    /// Linked function table (replaces `Vec<Function>`).
267    pub functions: Vec<LinkedFunction>,
268
269    /// Reverse lookup: content hash -> function index in `functions`.
270    pub hash_to_id: HashMap<FunctionHash, usize>,
271
272    /// Debug information.
273    pub debug_info: DebugInfo,
274
275    /// DataFrame schema for column name resolution.
276    pub data_schema: Option<DataFrameSchema>,
277
278    /// Module-binding variable names.
279    pub module_binding_names: Vec<String>,
280
281    /// Number of locals used by top-level code.
282    pub top_level_locals_count: u16,
283
284    /// Storage hints for top-level locals.
285    #[serde(default)]
286    pub top_level_local_storage_hints: Vec<StorageHint>,
287
288    /// Type schema registry for TypedObject field resolution.
289    #[serde(default)]
290    pub type_schema_registry: shape_runtime::type_schema::TypeSchemaRegistry,
291
292    /// Storage hints for module bindings.
293    #[serde(default)]
294    pub module_binding_storage_hints: Vec<StorageHint>,
295
296    /// Per-function local storage hints.
297    #[serde(default)]
298    pub function_local_storage_hints: Vec<Vec<StorageHint>>,
299
300    /// Typed frame layout for top-level locals.
301    #[serde(default)]
302    pub top_level_frame: Option<FrameDescriptor>,
303
304    /// Trait method dispatch registry.
305    pub trait_method_symbols: HashMap<String, String>,
306
307    /// Foreign function metadata table.
308    #[serde(default)]
309    pub foreign_functions: Vec<ForeignFunctionEntry>,
310
311    /// Native `type C` layout metadata table.
312    #[serde(default)]
313    pub native_struct_layouts: Vec<NativeStructLayoutEntry>,
314
315    /// Transitive union of all required permissions across all blobs.
316    /// Computed by the linker during `link()`.
317    #[serde(default = "default_permission_set")]
318    pub total_required_permissions: PermissionSet,
319}