Skip to main content

shape_vm/bytecode/
content_addressed.rs

1use super::*;
2use crate::type_tracking::{FrameDescriptor, StorageHint};
3
/// A 32-byte content hash identifying a function blob.
///
/// `FunctionBlob::compute_hash` fills this from a SHA-256 digest. The derived
/// `Default` is the all-zero array, the same value as `FunctionHash::ZERO`.
#[derive(Clone, Copy, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct FunctionHash(pub [u8; 32]);
6
7impl std::fmt::Debug for FunctionHash {
8    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
9        write!(f, "FunctionHash({})", self)
10    }
11}
12
13impl std::fmt::Display for FunctionHash {
14    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15        for byte in &self.0 {
16            write!(f, "{:02x}", byte)?;
17        }
18        Ok(())
19    }
20}
21
22impl FunctionHash {
23    /// The zero hash, used as a sentinel/placeholder.
24    pub const ZERO: Self = Self([0u8; 32]);
25}
26
/// A self-contained, content-addressed function blob.
///
/// Each blob carries its own instructions, constants, and strings (no shared
/// pools). The `content_hash` is the SHA-256 of the serialized content fields,
/// making deduplication and caching trivial.
///
/// Not every field participates in the hash: `compute_hash` serializes only
/// the fields mirrored in `FunctionBlobHashInput`. `frame_descriptor`,
/// `callee_names`, and `source_map` are left out, so two blobs differing only
/// in those fields hash identically.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunctionBlob {
    /// SHA-256 hash of the serialized hash input (the identity-defining
    /// fields below; see the struct-level note for the exclusions).
    /// Not updated automatically: call `finalize` after changing any field.
    pub content_hash: FunctionHash,

    // -- metadata --
    /// Function name. Part of the content hash.
    pub name: String,
    /// Function arity. Part of the content hash.
    pub arity: u16,
    /// Parameter names. Part of the content hash.
    pub param_names: Vec<String>,
    /// Number of locals used by this function. Part of the content hash.
    pub locals_count: u16,
    /// Whether this function is a closure. Part of the content hash.
    pub is_closure: bool,
    /// Number of captures. Part of the content hash.
    pub captures_count: u16,
    /// Whether this function is async. Part of the content hash.
    pub is_async: bool,
    /// Reference-parameter flags (presumably one per parameter, indexed like
    /// `param_names` — TODO confirm). Empty when absent from serialized data.
    #[serde(default)]
    pub ref_params: Vec<bool>,
    /// Flags presumably marking which reference parameters are mutated —
    /// TODO confirm. Empty when absent from serialized data.
    #[serde(default)]
    pub ref_mutates: Vec<bool>,
    /// Flags presumably marking which captures are mutable — TODO confirm.
    /// Empty when absent from serialized data.
    #[serde(default)]
    pub mutable_captures: Vec<bool>,
    /// Typed frame layout for this function's locals (propagated from compiler).
    /// Not part of the content hash; `None` when absent from serialized data.
    #[serde(default)]
    pub frame_descriptor: Option<FrameDescriptor>,

    // -- code --
    /// This function's bytecode instructions.
    pub instructions: Vec<Instruction>,
    /// This function's constant pool.
    pub constants: Vec<Constant>,
    /// This function's string pool.
    pub strings: Vec<String>,

    // -- permissions --
    /// Permissions required by this function (from capability_tags analysis).
    /// Falls back to `default_permission_set()` when absent from serialized data.
    #[serde(default = "default_permission_set")]
    pub required_permissions: PermissionSet,

    // -- dependency graph --
    /// Content hashes of functions this blob references
    /// (`Operand::Function(idx)` indexes into this vector).
    pub dependencies: Vec<FunctionHash>,

    /// Callee names corresponding to each dependency entry.
    /// Used during compilation to resolve forward references; not serialized
    /// and not part of the content hash.
    #[serde(skip, default)]
    pub callee_names: Vec<String>,

    // -- type info --
    /// Type names this function constructs (schema references).
    pub type_schemas: Vec<String>,

    // -- foreign function dependencies --
    /// Content hashes of foreign functions referenced by `CallForeign` opcodes.
    /// Sorted and deduplicated for deterministic hashing.
    /// NOTE(review): `compute_hash` hashes this vector as-is, so the sorting
    /// and deduplication must be done by whoever populates it — confirm.
    #[serde(default)]
    pub foreign_dependencies: Vec<[u8; 32]>,

    // -- debug --
    /// Source mapping entries local to this blob:
    /// `(local_instruction_offset, file_id, line)`.
    /// Not part of the content hash.
    pub source_map: Vec<(usize, u32, u32)>,
}
93
/// Helper struct for deterministic content hashing.
/// We serialize exactly the fields that define the function's identity.
///
/// This is a borrowed view over a `FunctionBlob`, built only inside
/// `FunctionBlob::compute_hash`. It is serialized with `rmp_serde` and the
/// resulting bytes are fed to SHA-256, so adding, removing, or reordering
/// fields here changes every content hash.
#[derive(Serialize)]
struct FunctionBlobHashInput<'a> {
    // -- metadata (borrowed from the same-named `FunctionBlob` fields) --
    name: &'a str,
    arity: u16,
    param_names: &'a [String],
    locals_count: u16,
    is_closure: bool,
    captures_count: u16,
    is_async: bool,
    ref_params: &'a [bool],
    ref_mutates: &'a [bool],
    mutable_captures: &'a [bool],
    // -- code --
    instructions: &'a [Instruction],
    constants: &'a [Constant],
    strings: &'a [String],
    // -- dependency graph / type info --
    dependencies: &'a [FunctionHash],
    type_schemas: &'a [String],
    /// Permission names sorted deterministically for stable hashing.
    /// NOTE(review): `compute_hash` collects these straight from
    /// `required_permissions.iter()` without an explicit sort, so the order is
    /// whatever that iterator yields — confirm it is deterministic.
    required_permission_names: Vec<&'a str>,
    /// Content hashes of foreign functions referenced by this blob.
    foreign_dependencies: &'a [[u8; 32]],
}
118
119impl FunctionBlob {
120    /// Compute the content hash from the blob's fields.
121    /// Call this after populating all fields, then assign the result to `content_hash`.
122    pub fn compute_hash(&self) -> FunctionHash {
123        // Convert PermissionSet to sorted permission names for deterministic hashing.
124        let perm_names: Vec<&str> = self.required_permissions.iter().map(|p| p.name()).collect();
125        let input = FunctionBlobHashInput {
126            name: &self.name,
127            arity: self.arity,
128            param_names: &self.param_names,
129            locals_count: self.locals_count,
130            is_closure: self.is_closure,
131            captures_count: self.captures_count,
132            is_async: self.is_async,
133            ref_params: &self.ref_params,
134            ref_mutates: &self.ref_mutates,
135            mutable_captures: &self.mutable_captures,
136            instructions: &self.instructions,
137            constants: &self.constants,
138            strings: &self.strings,
139            dependencies: &self.dependencies,
140            type_schemas: &self.type_schemas,
141            required_permission_names: perm_names,
142            foreign_dependencies: &self.foreign_dependencies,
143        };
144        // Use bincode-compatible MessagePack for deterministic serialization.
145        // rmp_serde::encode::to_vec uses the struct-as-array format which is
146        // order-preserving and deterministic for the types we use here.
147        let bytes = rmp_serde::encode::to_vec(&input)
148            .expect("FunctionBlob content serialization should not fail");
149        let digest = Sha256::digest(&bytes);
150        let mut hash = [0u8; 32];
151        hash.copy_from_slice(&digest);
152        FunctionHash(hash)
153    }
154
155    /// Build a blob with all fields set, then compute and assign its content hash.
156    pub fn finalize(&mut self) {
157        self.content_hash = self.compute_hash();
158    }
159}
160
/// A content-addressed program: a set of `FunctionBlob`s plus program-level metadata.
///
/// This is the **storage / cache** representation. Before execution the linker
/// flattens it into a `LinkedProgram`.
///
/// Fields marked `#[serde(default)]` fall back to their `Default` value when
/// missing from serialized input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Program {
    /// Hash of the entry-point function.
    /// (Presumably a key present in `function_store` — TODO confirm.)
    pub entry: FunctionHash,

    /// All function blobs keyed by content hash.
    /// NOTE(review): if this is `std::collections::HashMap`, iteration order
    /// is unspecified, so anything derived from iterating it should not
    /// assume a stable order — confirm against the linker.
    pub function_store: HashMap<FunctionHash, FunctionBlob>,

    /// Number of locals used by top-level code.
    pub top_level_locals_count: u16,

    /// Storage hints for top-level locals.
    #[serde(default)]
    pub top_level_local_storage_hints: Vec<StorageHint>,

    /// Module-binding variable names (index -> name).
    pub module_binding_names: Vec<String>,

    /// Storage hints for module bindings.
    /// (Presumably indexed like `module_binding_names` — TODO confirm.)
    #[serde(default)]
    pub module_binding_storage_hints: Vec<StorageHint>,

    /// Per-function local storage hints.
    /// (Outer index presumably identifies the function; the indexing scheme is
    /// not visible here — TODO confirm.)
    #[serde(default)]
    pub function_local_storage_hints: Vec<Vec<StorageHint>>,

    /// Typed frame layout for top-level locals.
    #[serde(default)]
    pub top_level_frame: Option<FrameDescriptor>,

    /// DataFrame schema for column name resolution.
    pub data_schema: Option<DataFrameSchema>,

    /// Type schema registry for TypedObject field resolution.
    #[serde(default)]
    pub type_schema_registry: shape_runtime::type_schema::TypeSchemaRegistry,

    /// Trait method dispatch registry.
    /// (String-to-string map; the key/value naming scheme is not visible here.)
    pub trait_method_symbols: HashMap<String, String>,

    /// Foreign function metadata table.
    #[serde(default)]
    pub foreign_functions: Vec<ForeignFunctionEntry>,

    /// Native `type C` layout metadata table.
    #[serde(default)]
    pub native_struct_layouts: Vec<NativeStructLayoutEntry>,

    /// Debug information (source files, variable names).
    pub debug_info: DebugInfo,
}
216
/// A linked function ready for execution in a flat instruction array.
///
/// Mirrors `Function` but adds `blob_hash` so the runtime can trace back
/// to the original content-addressed blob.
///
/// The metadata fields carry the same names and types as those on
/// `FunctionBlob`; the blob's code pools are not duplicated here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkedFunction {
    /// Content hash of the `FunctionBlob` this was linked from.
    pub blob_hash: FunctionHash,

    /// Offset into the flat `LinkedProgram::instructions` array.
    pub entry_point: usize,
    /// Number of instructions in this function's body.
    pub body_length: usize,

    // -- metadata (same as Function) --
    /// Function name.
    pub name: String,
    /// Function arity.
    pub arity: u16,
    /// Parameter names.
    pub param_names: Vec<String>,
    /// Number of locals used by this function.
    pub locals_count: u16,
    /// Whether this function is a closure.
    pub is_closure: bool,
    /// Number of captures.
    pub captures_count: u16,
    /// Whether this function is async.
    pub is_async: bool,
    /// Reference-parameter flags (presumably one per parameter — TODO confirm).
    /// Empty when absent from serialized data.
    #[serde(default)]
    pub ref_params: Vec<bool>,
    /// Flags presumably marking which reference parameters are mutated —
    /// TODO confirm. Empty when absent from serialized data.
    #[serde(default)]
    pub ref_mutates: Vec<bool>,
    /// Flags presumably marking which captures are mutable — TODO confirm.
    /// Empty when absent from serialized data.
    #[serde(default)]
    pub mutable_captures: Vec<bool>,
    /// Typed frame layout for this function's locals.
    #[serde(default)]
    pub frame_descriptor: Option<FrameDescriptor>,
}
249
/// A linked, execution-ready program with flat instruction/constant/string arrays.
///
/// This mirrors today's `BytecodeProgram` layout so the executor can run it
/// with minimal changes. Produced by the linker from a `Program`.
///
/// Derives `Default`; fields marked `#[serde(default)]` fall back to their
/// `Default` value when missing from serialized input.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct LinkedProgram {
    /// Hash of the entry-point function for execution.
    /// Defaults to the all-zero hash when absent from serialized data.
    #[serde(default)]
    pub entry: FunctionHash,

    /// Flat instruction array (all functions concatenated).
    /// `LinkedFunction::entry_point` is an offset into this array.
    pub instructions: Vec<Instruction>,

    /// Merged constant pool.
    pub constants: Vec<Constant>,

    /// Merged string pool.
    pub strings: Vec<String>,

    /// Linked function table (replaces `Vec<Function>`).
    pub functions: Vec<LinkedFunction>,

    /// Reverse lookup: content hash -> function index in `functions`.
    pub hash_to_id: HashMap<FunctionHash, usize>,

    /// Debug information.
    pub debug_info: DebugInfo,

    /// DataFrame schema for column name resolution.
    pub data_schema: Option<DataFrameSchema>,

    /// Module-binding variable names.
    pub module_binding_names: Vec<String>,

    /// Number of locals used by top-level code.
    pub top_level_locals_count: u16,

    /// Storage hints for top-level locals.
    #[serde(default)]
    pub top_level_local_storage_hints: Vec<StorageHint>,

    /// Type schema registry for TypedObject field resolution.
    #[serde(default)]
    pub type_schema_registry: shape_runtime::type_schema::TypeSchemaRegistry,

    /// Storage hints for module bindings.
    /// (Presumably indexed like `module_binding_names` — TODO confirm.)
    #[serde(default)]
    pub module_binding_storage_hints: Vec<StorageHint>,

    /// Per-function local storage hints.
    /// (Outer index presumably matches `functions` — TODO confirm.)
    #[serde(default)]
    pub function_local_storage_hints: Vec<Vec<StorageHint>>,

    /// Typed frame layout for top-level locals.
    #[serde(default)]
    pub top_level_frame: Option<FrameDescriptor>,

    /// Trait method dispatch registry.
    /// (String-to-string map; the key/value naming scheme is not visible here.)
    pub trait_method_symbols: HashMap<String, String>,

    /// Foreign function metadata table.
    #[serde(default)]
    pub foreign_functions: Vec<ForeignFunctionEntry>,

    /// Native `type C` layout metadata table.
    #[serde(default)]
    pub native_struct_layouts: Vec<NativeStructLayoutEntry>,

    /// Transitive union of all required permissions across all blobs.
    /// Computed by the linker during `link()`.
    /// Falls back to `default_permission_set()` when absent from serialized data.
    #[serde(default = "default_permission_set")]
    pub total_required_permissions: PermissionSet,
}