Skip to main content

lua_vm/
dump.rs

1//! Pre-compiled Lua chunk serializer.
2//!
3//! Translates `reference/lua-5.4.7/src/ldump.c` (230 lines, 9 functions + 1 public entry point).
4//! Writes a `LuaProto` to a byte sink in the standard Lua 5.4 bytecode format.
5
6
7// TODO(port): Adjust import paths once crate boundaries stabilise in Phase B.
8// The types below are expected to resolve as follows:
9//   GcRef        — lua_types (or lua-gc Phase D)
10//   LuaError     — lua_types
11//   LuaProto     — lua-vm (this crate) or lua-types
12//   LuaString    — lua-vm / lua-types
13//   LuaValue     — lua_types
14//   LuaState     — lua-vm (this crate)
15use std::mem::size_of;
16#[allow(unused_imports)] use crate::prelude::*;
17
18use crate::state::LuaState;
19use lua_types::{GcRef, LuaError, LuaString, LuaValue};
20use lua_types::proto::LuaProto;
21
// ── Constants from lundump.h ─────────────────────────────────────────────────

/// Chunk signature: ESC followed by the ASCII letters "Lua".
///
/// C's `dumpLiteral` writes `sizeof(s) - sizeof(char)` bytes, i.e. the string without its
/// NUL terminator, so exactly these four bytes are emitted.
const LUA_SIGNATURE: &[u8] = &[0x1b, b'L', b'u', b'a'];

/// `LUA_VERSION_NUM` for Lua 5.4 (macros.tsv).
const LUA_VERSION_NUM_DUMP: i32 = 504;

/// Version byte: major version in the high nibble, minor version in the low nibble.
/// For 504 this is 5 * 16 + 4 = 84 = 0x54.
const LUAC_VERSION: u8 = {
    let major = LUA_VERSION_NUM_DUMP / 100;
    let minor = LUA_VERSION_NUM_DUMP % 100;
    (major * 16 + minor) as u8
};

/// Format byte written right after the version byte.
const LUAC_FORMAT: u8 = 0;

/// Six-byte data marker that follows the format byte (again written without a NUL).
const LUAC_DATA: &[u8] = &[0x19, 0x93, b'\r', b'\n', 0x1a, b'\n'];

/// Integer sentinel written in the header in native byte order.
const LUAC_INT: i64 = 0x5678;

/// Float sentinel written in the header in native byte order.
const LUAC_NUM: f64 = 370.5;

/// Byte width of one bytecode instruction (`u32`).
const INSTRUCTION_SIZE: u8 = size_of::<u32>() as u8;

/// Byte width of a `lua_Integer` (`i64`).
const LUA_INTEGER_SIZE: u8 = size_of::<i64>() as u8;

/// Byte width of a `lua_Number` (`f64`).
const LUA_NUMBER_SIZE: u8 = size_of::<f64>() as u8;
50
51// ── DumpState ────────────────────────────────────────────────────────────────
52
53/// Internal state threaded through every dump operation.
54///
55///
56/// PORT NOTE: `lua_State *L` removed — it was used only for `lua_lock`/`lua_unlock`, which are
57/// no-ops in the default Lua build and dropped here (macros.tsv). `void *data` is folded into
58/// the writer closure. `int status` is replaced by `Result<(), LuaError>` propagated with `?`.
59struct DumpState<'a> {
60    /// Byte-sink callback. C original: `lua_Writer writer` + `void *data` (combined).
61    /// lua_Writer type is TBD in types.tsv; for dump we use a bare byte-slice callback.
62    writer: &'a mut dyn FnMut(&[u8]) -> Result<(), LuaError>,
63    /// When true, strip all debug information from the output.
64    strip: bool,
65}
66
67impl<'a> DumpState<'a> {
68    // ── Low-level write primitives ────────────────────────────────────────────
69
70    /// Write raw bytes to the output stream.
71    ///
72    ///
73    /// PORT NOTE: C accumulates errors in `D->status` and skips subsequent writes once
74    /// non-zero; Rust returns `Result<(), LuaError>` and short-circuits via `?`.
75    /// `lua_lock`/`lua_unlock` are no-ops in the default build and are dropped (macros.tsv).
76    fn dump_block(&mut self, data: &[u8]) -> Result<(), LuaError> {
77        if !data.is_empty() {
78            (self.writer)(data)?;
79        }
80        Ok(())
81    }
82
83    /// Write one byte.
84    ///
85    /// C body: `lu_byte x = (lu_byte)y; dumpVar(D, x);`
86    /// (`dumpVar(D,x)` expands to `dumpVector(D,&x,1)` expands to `dumpBlock(D,&x,sizeof(x))`)
87    fn dump_byte(&mut self, y: u8) -> Result<(), LuaError> {
88        self.dump_block(&[y])
89    }
90
91    /// Write a `size_t` using Lua's variable-length encoding.
92    ///
93    ///
94    /// Encoding (big-endian 7-bit groups, **last** byte marked with MSB = 1):
95    /// - Each byte holds 7 payload bits.
96    /// - Bytes are written most-significant group first.
97    /// - The final byte (least-significant group) has its MSB set as an end marker.
98    ///
99    /// This differs from standard LEB128, which marks the *continuation* bytes rather than
100    /// the terminating byte.
101    ///
102    fn dump_size(&mut self, mut x: usize) -> Result<(), LuaError> {
103        // DIBS = (usize::BITS + 6) / 7; on 64-bit = (64+6)/7 = 10.
104        const DIBS: usize = (usize::BITS as usize + 6) / 7;
105        let mut buff = [0u8; DIBS];
106        let mut n: usize = 0;
107
108        loop {
109            n += 1;
110            buff[DIBS - n] = (x & 0x7f) as u8; // fill buffer in reverse order
111            x >>= 7;
112            if x == 0 {
113                break;
114            }
115        }
116
117        // The byte at buff[DIBS-1] is the first byte placed (least-significant group).
118        // Setting its MSB marks it as the terminal byte of the encoding.
119        buff[DIBS - 1] |= 0x80;
120
121        self.dump_block(&buff[DIBS - n..])
122    }
123
124    /// Write an `int` as a variable-length size.
125    ///
126    ///
127    /// PORT NOTE: C implicitly casts `int` → `size_t`. All call sites pass non-negative values
128    /// (line numbers, instruction counts, vector lengths); a debug assertion guards this.
129    fn dump_int(&mut self, x: i32) -> Result<(), LuaError> {
130        debug_assert!(
131            x >= 0,
132            "dump_int: negative value {} cast to usize would wrap",
133            x
134        );
135        self.dump_size(x as usize)
136    }
137
138    /// Write a `lua_Number` (f64) in the platform's native byte order.
139    ///
140    ///
141    /// `dumpVar(D,x)` expands to `dumpBlock(D, &x, sizeof(lua_Number))` — 8 bytes, native order.
142    /// `to_ne_bytes()` replicates native-endian serialisation. The bytecode header's `LUAC_NUM`
143    /// sentinel (370.5) lets `lundump` detect byte-order mismatches at load time.
144    fn dump_number(&mut self, x: f64) -> Result<(), LuaError> {
145        self.dump_block(&x.to_ne_bytes())
146    }
147
148    /// Write a `lua_Integer` (i64) in the platform's native byte order.
149    ///
150    fn dump_integer(&mut self, x: i64) -> Result<(), LuaError> {
151        self.dump_block(&x.to_ne_bytes())
152    }
153
154    // ── Mid-level serialisers ─────────────────────────────────────────────────
155
156    /// Write an interned or long string, or a null sentinel (encoded size = 0).
157    ///
158    ///
159    /// Encoding: `dumpSize(len + 1)` followed by `len` raw bytes; size 0 means null/absent.
160    /// `tsslen(s)` → `s.len()` and `getstr(s)` → `s.as_bytes()` (macros.tsv).
161    fn dump_string(&mut self, s: Option<&GcRef<LuaString>>) -> Result<(), LuaError> {
162        match s {
163            None => self.dump_size(0),
164
165            Some(s) => {
166                let bytes = s.as_bytes(); // tsslen → .len(); getstr → .as_bytes()
167                self.dump_size(bytes.len() + 1)?;
168                self.dump_block(bytes)
169            }
170        }
171    }
172
173    /// Write the bytecode instruction array.
174    ///
175    ///
176    /// PORT NOTE: `f->sizecode` is covered by `Vec::len()` (types.tsv).
177    fn dump_code(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
178        self.dump_int(proto.code.len() as i32)?;
179
180        // dumpVector writes n * sizeof(Instruction) = n * 4 bytes in native byte order.
181        for instr in &proto.code {
182            // TODO(port): `Instruction` is a u32 newtype (types.tsv). Accessing the inner u32
183            // via `.0` assumes a tuple-struct layout. If the Instruction API differs (e.g.,
184            // exposes `.raw()` or `u32::from(*instr)`), adjust accordingly in Phase B.
185            self.dump_block(&instr.0.to_ne_bytes())?;
186        }
187        Ok(())
188    }
189
190    /// Write the constant pool.
191    ///
192    ///
193    /// Each constant is written as: one tag byte (`ttypetag`), followed by the payload
194    /// (float: 8 bytes; integer: 8 bytes; string: variable-length; nil/bool: nothing).
195    ///
196    /// PORT NOTE: `f->sizek` is covered by `Vec::len()` (types.tsv).
197    fn dump_constants(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
198        let n = proto.k.len();
199        self.dump_int(n as i32)?;
200
201        for constant in &proto.k {
202            // ttypetag(o) → o.full_type_tag() (macros.tsv)
203            // Returns the C-side tag byte: bits 0-3 base type, bits 4-5 variant, bit 6 collectable.
204            let tag = constant.full_type_tag();
205            self.dump_byte(tag)?;
206
207            match constant {
208                LuaValue::Float(f) => {
209                    // fltvalue(o) → o.as_float().expect("not float") or `if let` (macros.tsv)
210                    self.dump_number(*f)?;
211                }
212                LuaValue::Int(i) => {
213                    self.dump_integer(*i)?;
214                }
215                LuaValue::Str(s) => {
216                    // tsvalue(o) → o.as_string().expect("not string") (macros.tsv)
217                    self.dump_string(Some(s))?;
218                }
219                LuaValue::Nil | LuaValue::Bool(_) => {
220                    // Only the tag byte is written; nil and booleans carry no additional payload.
221                    // lua_assert → debug_assert! (macros.tsv)
222                    debug_assert!(
223                        matches!(constant, LuaValue::Nil | LuaValue::Bool(_)),
224                        "dump_constants: default branch reached for unexpected variant"
225                    );
226                }
227                _ => {
228                    // TODO(port): LuaValue variant not valid as a constant-pool entry.
229                    // In C the default branch asserts nil/false/true only. Any other variant
230                    // here indicates a malformed proto; flag for Phase B investigation.
231                    debug_assert!(false, "dump_constants: unexpected LuaValue variant in constant pool");
232                }
233            }
234        }
235        Ok(())
236    }
237
238    /// Write nested function prototypes (sub-functions defined inside `proto`).
239    ///
240    ///
241    /// PORT NOTE: `f->sizep` is covered by `Vec::len()` (types.tsv).
242    /// The parent's source string is passed down so that children with identical source
243    /// origins can omit the redundant source name (see `dump_function`).
244    fn dump_protos(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
245        let n = proto.p.len();
246        self.dump_int(n as i32)?;
247
248        for sub in &proto.p {
249            // sub: &GcRef<LuaProto>; deref coercion (&GcRef<LuaProto> → &LuaProto) expected
250            // when GcRef<T>: Deref<Target=T> (true for Rc<T> in Phase A).
251            self.dump_function(sub, proto.source.as_ref())?;
252        }
253        Ok(())
254    }
255
256    /// Write upvalue descriptors (instack / idx / kind for each upvalue slot).
257    ///
258    ///
259    /// PORT NOTE: `f->sizeupvalues` is covered by `Vec::len()` (types.tsv).
260    /// `Upvaldesc.instack` is `bool` in Rust (types.tsv); cast to `u8` for the wire format.
261    fn dump_upvalues(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
262        let n = proto.upvalues.len();
263        self.dump_int(n as i32)?;
264
265        for upval in &proto.upvalues {
266            // PORT NOTE: instack is bool in Rust (types.tsv); cast to u8: true→1, false→0.
267            self.dump_byte(upval.instack as u8)?;
268            self.dump_byte(upval.idx)?;
269            self.dump_byte(upval.kind)?;
270        }
271        Ok(())
272    }
273
274    /// Write debug information: per-instruction line deltas, absolute line records,
275    /// local-variable lifetimes, and upvalue names.
276    ///
277    /// All counts are written as zero when `self.strip` is true.
278    ///
279    ///
280    /// PORT NOTE: all `f->size*` fields are covered by `Vec::len()` (types.tsv).
281    fn dump_debug(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
282        let n_lineinfo = if self.strip { 0 } else { proto.lineinfo.len() };
283        self.dump_int(n_lineinfo as i32)?;
284
285        // lineinfo is Vec<i8> (ls_byte per types.tsv). C writes them as raw bytes (sizeof(i8)=1).
286        // Cast each i8 to u8 (same bit pattern) before writing.
287        // PERF(port): iterating one byte at a time vs. bulk write — profile in Phase B.
288        // (A bulk write would require bytemuck::cast_slice or similar to avoid unsafe.)
289        let lineinfo_bytes: Vec<u8> = proto.lineinfo[..n_lineinfo]
290            .iter()
291            .map(|&b| b as u8)
292            .collect();
293        self.dump_block(&lineinfo_bytes)?;
294
295        let n_absline = if self.strip { 0 } else { proto.abslineinfo.len() };
296        self.dump_int(n_absline as i32)?;
297
298        for abs in proto.abslineinfo.iter().take(n_absline) {
299            // AbsLineInfo.pc and .line are i32 (types.tsv); non-negative in valid bytecode.
300            self.dump_int(abs.pc)?;
301            self.dump_int(abs.line)?;
302        }
303
304        let n_locvars = if self.strip { 0 } else { proto.locvars.len() };
305        self.dump_int(n_locvars as i32)?;
306
307        for locvar in proto.locvars.iter().take(n_locvars) {
308            // LocVar.varname is GcRef<LuaString> (types.tsv).
309            self.dump_string(Some(&locvar.varname))?;
310            self.dump_int(locvar.startpc)?;
311            self.dump_int(locvar.endpc)?;
312        }
313
314        // (Re-uses upvalues.len() for the name-writing pass — separate from dumpUpvalues
315        //  which wrote structural descriptors; here we write debug names.)
316        let n_upval_names = if self.strip { 0 } else { proto.upvalues.len() };
317        self.dump_int(n_upval_names as i32)?;
318
319        for upval in proto.upvalues.iter().take(n_upval_names) {
320            // PORT NOTE: UpvalDesc.name is GcRef<LuaString> per types.tsv (non-optional).
321            // TODO(port): In C, `TString *name` can be NULL when an upvalue is unnamed (e.g.,
322            // in bytecode compiled without debug info). Verify whether UpvalDesc.name should be
323            // `Option<GcRef<LuaString>>` in the Rust model; if so, change call to pass the Option
324            // directly instead of wrapping in Some.
325            self.dump_string(upval.name.as_ref())?;
326        }
327        Ok(())
328    }
329
330    /// Write a complete function prototype: source name, header bytes, code, constants,
331    /// upvalue descriptors, nested prototypes, and debug information.
332    ///
333    /// `psource` is the parent function's source string. When `f->source == psource` (pointer
334    /// equality — Lua interns short strings so identical source names share an object), the
335    /// source is written as null (size 0) to avoid duplication. The top-level call passes
336    /// `None` to force writing the source.
337    ///
338    ///
339    /// PORT NOTE: `f->source == psource` is a C pointer comparison exploiting string interning.
340    /// In Rust we use `GcRef::ptr_eq` (equivalent to `Rc::ptr_eq` in Phase A) for identity.
341    /// `is_vararg` is `bool` in Rust (types.tsv); cast to `u8` for the wire format.
342    fn dump_function(
343        &mut self,
344        proto: &LuaProto,
345        psource: Option<&GcRef<LuaString>>,
346    ) -> Result<(), LuaError> {
347        // Pointer-equality check: same interned string object means same source file.
348        let same_source = match (psource, proto.source.as_ref()) {
349            (Some(ps), Some(src)) => GcRef::ptr_eq(src, ps),
350            _ => false,
351        };
352
353        if self.strip || same_source {
354            self.dump_string(None)?;
355        } else {
356            self.dump_string(proto.source.as_ref())?;
357        }
358
359        self.dump_int(proto.linedefined)?;
360        self.dump_int(proto.lastlinedefined)?;
361        self.dump_byte(proto.numparams)?;
362        // PORT NOTE: is_vararg is bool in Rust (types.tsv); true → 1u8, false → 0u8.
363        self.dump_byte(proto.is_vararg as u8)?;
364        self.dump_byte(proto.maxstacksize)?;
365
366        self.dump_code(proto)?;
367        self.dump_constants(proto)?;
368        self.dump_upvalues(proto)?;
369        self.dump_protos(proto)?;
370        self.dump_debug(proto)?;
371        Ok(())
372    }
373
374    /// Write the binary chunk header.
375    ///
376    /// The header allows `lundump` (and external tools) to verify the bytecode format,
377    /// platform word sizes, and byte order before attempting to load the chunk.
378    ///
379    fn dump_header(&mut self) -> Result<(), LuaError> {
380        // dumpLiteral(D,s) = dumpBlock(D, s, sizeof(s) - sizeof(char))
381        // b"\x1bLua" is &[u8; 4] (no NUL terminator in Rust byte literals), matching the
382        // C expansion of sizeof("\x1bLua")-1 = 4 bytes.
383        self.dump_block(LUA_SIGNATURE)?;
384
385        self.dump_byte(LUAC_VERSION)?;
386
387        self.dump_byte(LUAC_FORMAT)?;
388
389        // b"\x19\x93\r\n\x1a\n" is &[u8; 6], matching sizeof(LUAC_DATA)-1 = 6 bytes.
390        self.dump_block(LUAC_DATA)?;
391
392        self.dump_byte(INSTRUCTION_SIZE)?;
393
394        self.dump_byte(LUA_INTEGER_SIZE)?;
395
396        self.dump_byte(LUA_NUMBER_SIZE)?;
397
398        self.dump_integer(LUAC_INT)?;
399
400        self.dump_number(LUAC_NUM)?;
401
402        Ok(())
403    }
404}
405
406// ── Public entry point ───────────────────────────────────────────────────────
407
408/// Serialize a compiled Lua function prototype as a precompiled bytecode chunk.
409///
410/// The `writer` callback receives successive slices of the serialised bytes and returns
411/// `Err(LuaError)` to abort. `strip` omits debug info (line numbers, local names, etc.)
412/// from the output.
413///
414///
415/// PORT NOTE: `lua_Writer w` (fn pointer) + `void *data` (userdata) are collapsed into a
416/// single `impl FnMut(&[u8]) -> Result<(), LuaError>` closure — the Rust idiom for the
417/// callback + context pair. `_state` is retained in the signature for API parity but unused
418/// in the body: the C code needed it only for `lua_lock`/`lua_unlock`, which are no-ops per
419/// macros.tsv. Return type changes from `int` (0 = ok, non-zero = writer error) to
420/// `Result<(), LuaError>`.
421pub(crate) fn dump(
422    _state: &LuaState,
423    proto: &GcRef<LuaProto>,
424    writer: &mut dyn FnMut(&[u8]) -> Result<(), LuaError>,
425    strip: bool,
426) -> Result<(), LuaError> {
427    let mut d = DumpState {
428        writer,
429        strip,
430    };
431
432    d.dump_header()?;
433
434    // PORT NOTE: f->sizeupvalues is covered by Vec::len(). Bounded by MAXUPVAL = 255
435    // (macros.tsv), so truncation via `as u8` is safe for well-formed prototypes.
436    d.dump_byte(proto.upvalues.len() as u8)?;
437
438    // psource = None forces the top-level function to always write its source name.
439    // Deref coercion: &GcRef<LuaProto> → &LuaProto (via Deref<Target=LuaProto> on GcRef/Rc).
440    d.dump_function(proto, None)?;
441
442    Ok(())
443}
444
445// ────────────────────────────────────────────────────────────────────────────
446// PORT STATUS
447//   source:        src/ldump.c  (230 lines, 10 functions)
448//   target_crate:  lua-vm
449//   confidence:    medium
450//   todos:         4
451//   port_notes:    12
452//   unsafe_blocks: 0
453//   notes:         Types/imports need Phase B wiring; logic should be faithful.
454//                  Key uncertainties: (1) Instruction newtype inner-field access (.0 vs
455//                  method); (2) UpvalDesc.name optionality; (3) GcRef::ptr_eq method
456//                  existence. Lineinfo bulk-write is done via collect()+dump_block to
457//                  avoid unsafe transmute of &[i8] → &[u8]; revisit with bytemuck in
458//                  Phase B for performance. Native-endian serialisation via to_ne_bytes()
459//                  matches C's raw-memory dumpVector behaviour.
460// ────────────────────────────────────────────────────────────────────────────