Skip to main content

lua_vm/
dump.rs

1//! Pre-compiled Lua chunk serializer.
2//!
3//! Translates `reference/lua-5.4.7/src/ldump.c` (230 lines, 9 functions + 1 public entry point).
4//! Writes a `LuaProto` to a byte sink in the standard Lua 5.4 bytecode format.
5
6// TODO(port): Adjust import paths once crate boundaries stabilise in Phase B.
7// The types below are expected to resolve as follows:
8//   GcRef        — lua_types (or lua-gc Phase D)
9//   LuaError     — lua_types
10//   LuaProto     — lua-vm (this crate) or lua-types
11//   LuaString    — lua-vm / lua-types
12//   LuaValue     — lua_types
13//   LuaState     — lua-vm (this crate)
14#[allow(unused_imports)]
15use crate::prelude::*;
16use std::mem::size_of;
17
18use crate::state::LuaState;
19use lua_types::proto::LuaProto;
20use lua_types::{GcRef, LuaError, LuaString, LuaValue, LuaVersion};
21
22// ── Constants from lundump.h ─────────────────────────────────────────────────
23
// dumpLiteral expands to dumpBlock(D, s, sizeof(s) - sizeof(char)).
// sizeof("\x1bLua") = 5; minus 1 = 4 bytes, no NUL terminator.
// b"\x1bLua" is &[u8; 4] in Rust — no NUL — so direct use is correct.
// Chunk signature: the very first bytes `dump_header` writes.
const LUA_SIGNATURE: &[u8] = b"\x1bLua";

// With LUA_VERSION_NUM = 504 (macros.tsv):
//   (504 / 100) * 16 + 504 % 100 = 5 * 16 + 4 = 84 = 0x54
const LUA_VERSION_NUM_DUMP_54: i32 = 504;
// Version byte written after the signature unless the target version is `LuaVersion::V55`.
const LUAC_VERSION_54: u8 =
    ((LUA_VERSION_NUM_DUMP_54 / 100) * 16 + LUA_VERSION_NUM_DUMP_54 % 100) as u8;
// Version byte written after the signature when the target version is `LuaVersion::V55`.
const LUAC_VERSION_55: u8 = 0x55;

// Format byte written immediately after the version byte.
const LUAC_FORMAT: u8 = 0;

// sizeof("\x19\x93\r\n\x1a\n") = 7; minus 1 = 6 bytes written.
// b"\x19\x93\r\n\x1a\n" is &[u8; 6].
// Fixed byte sequence written after the format byte.
const LUAC_DATA: &[u8] = b"\x19\x93\r\n\x1a\n";

// Integer check value written (native byte order) at the end of the 5.4 header layout.
const LUAC_INT: i64 = 0x5678;

// Float check value written (native byte order) at the end of the 5.4 header layout.
const LUAC_NUM: f64 = 370.5;

// Integer check value for the 5.5 header layout. `dump_header` writes it twice: once
// narrowed to `i32` and once as a full `i64`.
const LUAC_INT_55: i64 = -0x5678;

// Instruction-sized check value written after the instruction size byte in the 5.5 layout.
const LUAC_INST_55: u32 = 0x12345678;

// Float check value written after the number size byte in the 5.5 layout.
const LUAC_NUM_55: f64 = -370.5;

// Size bytes recorded in the header: width of an instruction (u32), ...
const INSTRUCTION_SIZE: u8 = size_of::<u32>() as u8;

// ... width of a Lua integer (i64), ...
const LUA_INTEGER_SIZE: u8 = size_of::<i64>() as u8;

// ... and width of a Lua number (f64).
const LUA_NUMBER_SIZE: u8 = size_of::<f64>() as u8;
57
58// ── DumpState ────────────────────────────────────────────────────────────────
59
/// Internal state threaded through every dump operation.
///
/// Bundles the output sink with the two options that shape the emitted chunk: whether
/// debug information is stripped and which bytecode version's header is written.
///
/// PORT NOTE: `lua_State *L` removed — it was used only for `lua_lock`/`lua_unlock`, which are
/// no-ops in the default Lua build and dropped here (macros.tsv). `void *data` is folded into
/// the writer closure. `int status` is replaced by `Result<(), LuaError>` propagated with `?`.
struct DumpState<'a> {
    /// Byte-sink callback. C original: `lua_Writer writer` + `void *data` (combined).
    /// lua_Writer type is TBD in types.tsv; for dump we use a bare byte-slice callback.
    /// An `Err` returned by the callback aborts the dump and is propagated to the caller.
    writer: &'a mut dyn FnMut(&[u8]) -> Result<(), LuaError>,
    /// When true, strip all debug information from the output: `dump_debug` writes every
    /// count as zero and `dump_function` writes the source name as the null string.
    strip: bool,
    /// Target bytecode version. `dump_header` emits the 5.5 header layout when this is
    /// `LuaVersion::V55` and the 5.4 layout for any other value.
    version: LuaVersion,
}
74
75impl<'a> DumpState<'a> {
76    // ── Low-level write primitives ────────────────────────────────────────────
77
78    /// Write raw bytes to the output stream.
79    ///
80    ///
81    /// PORT NOTE: C accumulates errors in `D->status` and skips subsequent writes once
82    /// non-zero; Rust returns `Result<(), LuaError>` and short-circuits via `?`.
83    /// `lua_lock`/`lua_unlock` are no-ops in the default build and are dropped (macros.tsv).
84    fn dump_block(&mut self, data: &[u8]) -> Result<(), LuaError> {
85        if !data.is_empty() {
86            (self.writer)(data)?;
87        }
88        Ok(())
89    }
90
91    /// Write one byte.
92    ///
93    /// C body: `lu_byte x = (lu_byte)y; dumpVar(D, x);`
94    /// (`dumpVar(D,x)` expands to `dumpVector(D,&x,1)` expands to `dumpBlock(D,&x,sizeof(x))`)
95    fn dump_byte(&mut self, y: u8) -> Result<(), LuaError> {
96        self.dump_block(&[y])
97    }
98
99    /// Write a `size_t` using Lua's variable-length encoding.
100    ///
101    ///
102    /// Encoding (big-endian 7-bit groups, **last** byte marked with MSB = 1):
103    /// - Each byte holds 7 payload bits.
104    /// - Bytes are written most-significant group first.
105    /// - The final byte (least-significant group) has its MSB set as an end marker.
106    ///
107    /// This differs from standard LEB128, which marks the *continuation* bytes rather than
108    /// the terminating byte.
109    ///
110    fn dump_size(&mut self, mut x: usize) -> Result<(), LuaError> {
111        // DIBS = (usize::BITS + 6) / 7; on 64-bit = (64+6)/7 = 10.
112        const DIBS: usize = (usize::BITS as usize + 6) / 7;
113        let mut buff = [0u8; DIBS];
114        let mut n: usize = 0;
115
116        loop {
117            n += 1;
118            buff[DIBS - n] = (x & 0x7f) as u8; // fill buffer in reverse order
119            x >>= 7;
120            if x == 0 {
121                break;
122            }
123        }
124
125        // The byte at buff[DIBS-1] is the first byte placed (least-significant group).
126        // Setting its MSB marks it as the terminal byte of the encoding.
127        buff[DIBS - 1] |= 0x80;
128
129        self.dump_block(&buff[DIBS - n..])
130    }
131
132    /// Write an `int` as a variable-length size.
133    ///
134    ///
135    /// PORT NOTE: C implicitly casts `int` → `size_t`. All call sites pass non-negative values
136    /// (line numbers, instruction counts, vector lengths); a debug assertion guards this.
137    fn dump_int(&mut self, x: i32) -> Result<(), LuaError> {
138        debug_assert!(
139            x >= 0,
140            "dump_int: negative value {} cast to usize would wrap",
141            x
142        );
143        self.dump_size(x as usize)
144    }
145
146    /// Write a `lua_Number` (f64) in the platform's native byte order.
147    ///
148    ///
149    /// `dumpVar(D,x)` expands to `dumpBlock(D, &x, sizeof(lua_Number))` — 8 bytes, native order.
150    /// `to_ne_bytes()` replicates native-endian serialisation. The bytecode header's `LUAC_NUM`
151    /// sentinel (370.5) lets `lundump` detect byte-order mismatches at load time.
152    fn dump_number(&mut self, x: f64) -> Result<(), LuaError> {
153        self.dump_block(&x.to_ne_bytes())
154    }
155
156    /// Write a `lua_Integer` (i64) in the platform's native byte order.
157    ///
158    fn dump_integer(&mut self, x: i64) -> Result<(), LuaError> {
159        self.dump_block(&x.to_ne_bytes())
160    }
161
162    fn dump_raw_i32(&mut self, x: i32) -> Result<(), LuaError> {
163        self.dump_block(&x.to_ne_bytes())
164    }
165
166    fn dump_raw_u32(&mut self, x: u32) -> Result<(), LuaError> {
167        self.dump_block(&x.to_ne_bytes())
168    }
169
170    // ── Mid-level serialisers ─────────────────────────────────────────────────
171
172    /// Write an interned or long string, or a null sentinel (encoded size = 0).
173    ///
174    ///
175    /// Encoding: `dumpSize(len + 1)` followed by `len` raw bytes; size 0 means null/absent.
176    /// `tsslen(s)` → `s.len()` and `getstr(s)` → `s.as_bytes()` (macros.tsv).
177    fn dump_string(&mut self, s: Option<&GcRef<LuaString>>) -> Result<(), LuaError> {
178        match s {
179            None => self.dump_size(0),
180
181            Some(s) => {
182                let bytes = s.as_bytes(); // tsslen → .len(); getstr → .as_bytes()
183                self.dump_size(bytes.len() + 1)?;
184                self.dump_block(bytes)
185            }
186        }
187    }
188
189    /// Write the bytecode instruction array.
190    ///
191    ///
192    /// PORT NOTE: `f->sizecode` is covered by `Vec::len()` (types.tsv).
193    fn dump_code(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
194        self.dump_int(proto.code.len() as i32)?;
195
196        // dumpVector writes n * sizeof(Instruction) = n * 4 bytes in native byte order.
197        for instr in &proto.code {
198            // TODO(port): `Instruction` is a u32 newtype (types.tsv). Accessing the inner u32
199            // via `.0` assumes a tuple-struct layout. If the Instruction API differs (e.g.,
200            // exposes `.raw()` or `u32::from(*instr)`), adjust accordingly in Phase B.
201            self.dump_block(&instr.0.to_ne_bytes())?;
202        }
203        Ok(())
204    }
205
206    /// Write the constant pool.
207    ///
208    ///
209    /// Each constant is written as: one tag byte (`ttypetag`), followed by the payload
210    /// (float: 8 bytes; integer: 8 bytes; string: variable-length; nil/bool: nothing).
211    ///
212    /// PORT NOTE: `f->sizek` is covered by `Vec::len()` (types.tsv).
213    fn dump_constants(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
214        let n = proto.k.len();
215        self.dump_int(n as i32)?;
216
217        for constant in &proto.k {
218            // ttypetag(o) → o.full_type_tag() (macros.tsv)
219            // Returns the C-side tag byte: bits 0-3 base type, bits 4-5 variant, bit 6 collectable.
220            let tag = constant.full_type_tag();
221            self.dump_byte(tag)?;
222
223            match constant {
224                LuaValue::Float(f) => {
225                    // fltvalue(o) → o.as_float().expect("not float") or `if let` (macros.tsv)
226                    self.dump_number(*f)?;
227                }
228                LuaValue::Int(i) => {
229                    self.dump_integer(*i)?;
230                }
231                LuaValue::Str(s) => {
232                    // tsvalue(o) → o.as_string().expect("not string") (macros.tsv)
233                    self.dump_string(Some(s))?;
234                }
235                LuaValue::Nil | LuaValue::Bool(_) => {
236                    // Only the tag byte is written; nil and booleans carry no additional payload.
237                    // lua_assert → debug_assert! (macros.tsv)
238                    debug_assert!(
239                        matches!(constant, LuaValue::Nil | LuaValue::Bool(_)),
240                        "dump_constants: default branch reached for unexpected variant"
241                    );
242                }
243                _ => {
244                    // TODO(port): LuaValue variant not valid as a constant-pool entry.
245                    // In C the default branch asserts nil/false/true only. Any other variant
246                    // here indicates a malformed proto; flag for Phase B investigation.
247                    debug_assert!(
248                        false,
249                        "dump_constants: unexpected LuaValue variant in constant pool"
250                    );
251                }
252            }
253        }
254        Ok(())
255    }
256
257    /// Write nested function prototypes (sub-functions defined inside `proto`).
258    ///
259    ///
260    /// PORT NOTE: `f->sizep` is covered by `Vec::len()` (types.tsv).
261    /// The parent's source string is passed down so that children with identical source
262    /// origins can omit the redundant source name (see `dump_function`).
263    fn dump_protos(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
264        let n = proto.p.len();
265        self.dump_int(n as i32)?;
266
267        for sub in &proto.p {
268            // sub: &GcRef<LuaProto>; deref coercion (&GcRef<LuaProto> → &LuaProto) expected
269            // when GcRef<T>: Deref<Target=T> (true for Rc<T> in Phase A).
270            self.dump_function(sub, proto.source.as_ref())?;
271        }
272        Ok(())
273    }
274
275    /// Write upvalue descriptors (instack / idx / kind for each upvalue slot).
276    ///
277    ///
278    /// PORT NOTE: `f->sizeupvalues` is covered by `Vec::len()` (types.tsv).
279    /// `Upvaldesc.instack` is `bool` in Rust (types.tsv); cast to `u8` for the wire format.
280    fn dump_upvalues(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
281        let n = proto.upvalues.len();
282        self.dump_int(n as i32)?;
283
284        for upval in &proto.upvalues {
285            // PORT NOTE: instack is bool in Rust (types.tsv); cast to u8: true→1, false→0.
286            self.dump_byte(upval.instack as u8)?;
287            self.dump_byte(upval.idx)?;
288            self.dump_byte(upval.kind)?;
289        }
290        Ok(())
291    }
292
293    /// Write debug information: per-instruction line deltas, absolute line records,
294    /// local-variable lifetimes, and upvalue names.
295    ///
296    /// All counts are written as zero when `self.strip` is true.
297    ///
298    ///
299    /// PORT NOTE: all `f->size*` fields are covered by `Vec::len()` (types.tsv).
300    fn dump_debug(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
301        let n_lineinfo = if self.strip { 0 } else { proto.lineinfo.len() };
302        self.dump_int(n_lineinfo as i32)?;
303
304        // lineinfo is Vec<i8> (ls_byte per types.tsv). C writes them as raw bytes (sizeof(i8)=1).
305        // Cast each i8 to u8 (same bit pattern) before writing.
306        // PERF(port): iterating one byte at a time vs. bulk write — profile in Phase B.
307        // (A bulk write would require bytemuck::cast_slice or similar to avoid unsafe.)
308        let lineinfo_bytes: Vec<u8> = proto.lineinfo[..n_lineinfo]
309            .iter()
310            .map(|&b| b as u8)
311            .collect();
312        self.dump_block(&lineinfo_bytes)?;
313
314        let n_absline = if self.strip {
315            0
316        } else {
317            proto.abslineinfo.len()
318        };
319        self.dump_int(n_absline as i32)?;
320
321        for abs in proto.abslineinfo.iter().take(n_absline) {
322            // AbsLineInfo.pc and .line are i32 (types.tsv); non-negative in valid bytecode.
323            self.dump_int(abs.pc)?;
324            self.dump_int(abs.line)?;
325        }
326
327        let n_locvars = if self.strip { 0 } else { proto.locvars.len() };
328        self.dump_int(n_locvars as i32)?;
329
330        for locvar in proto.locvars.iter().take(n_locvars) {
331            // LocVar.varname is GcRef<LuaString> (types.tsv).
332            self.dump_string(Some(&locvar.varname))?;
333            self.dump_int(locvar.startpc)?;
334            self.dump_int(locvar.endpc)?;
335        }
336
337        // (Re-uses upvalues.len() for the name-writing pass — separate from dumpUpvalues
338        //  which wrote structural descriptors; here we write debug names.)
339        let n_upval_names = if self.strip { 0 } else { proto.upvalues.len() };
340        self.dump_int(n_upval_names as i32)?;
341
342        for upval in proto.upvalues.iter().take(n_upval_names) {
343            // PORT NOTE: UpvalDesc.name is GcRef<LuaString> per types.tsv (non-optional).
344            // TODO(port): In C, `TString *name` can be NULL when an upvalue is unnamed (e.g.,
345            // in bytecode compiled without debug info). Verify whether UpvalDesc.name should be
346            // `Option<GcRef<LuaString>>` in the Rust model; if so, change call to pass the Option
347            // directly instead of wrapping in Some.
348            self.dump_string(upval.name.as_ref())?;
349        }
350        Ok(())
351    }
352
353    /// Write a complete function prototype: source name, header bytes, code, constants,
354    /// upvalue descriptors, nested prototypes, and debug information.
355    ///
356    /// `psource` is the parent function's source string. When `f->source == psource` (pointer
357    /// equality — Lua interns short strings so identical source names share an object), the
358    /// source is written as null (size 0) to avoid duplication. The top-level call passes
359    /// `None` to force writing the source.
360    ///
361    ///
362    /// PORT NOTE: `f->source == psource` is a C pointer comparison exploiting string interning.
363    /// In Rust we use `GcRef::ptr_eq` (equivalent to `Rc::ptr_eq` in Phase A) for identity.
364    /// `is_vararg` is `bool` in Rust (types.tsv); cast to `u8` for the wire format.
365    fn dump_function(
366        &mut self,
367        proto: &LuaProto,
368        psource: Option<&GcRef<LuaString>>,
369    ) -> Result<(), LuaError> {
370        // Pointer-equality check: same interned string object means same source file.
371        let same_source = match (psource, proto.source.as_ref()) {
372            (Some(ps), Some(src)) => GcRef::ptr_eq(src, ps),
373            _ => false,
374        };
375
376        if self.strip || same_source {
377            self.dump_string(None)?;
378        } else {
379            self.dump_string(proto.source.as_ref())?;
380        }
381
382        self.dump_int(proto.linedefined)?;
383        self.dump_int(proto.lastlinedefined)?;
384        self.dump_byte(proto.numparams)?;
385        // PORT NOTE: is_vararg is bool in Rust (types.tsv); true → 1u8, false → 0u8.
386        self.dump_byte(proto.is_vararg as u8)?;
387        self.dump_byte(proto.maxstacksize)?;
388
389        self.dump_code(proto)?;
390        self.dump_constants(proto)?;
391        self.dump_upvalues(proto)?;
392        self.dump_protos(proto)?;
393        self.dump_debug(proto)?;
394        Ok(())
395    }
396
397    /// Write the binary chunk header.
398    ///
399    /// The header allows `lundump` (and external tools) to verify the bytecode format,
400    /// platform word sizes, and byte order before attempting to load the chunk.
401    ///
402    fn dump_header(&mut self) -> Result<(), LuaError> {
403        // dumpLiteral(D,s) = dumpBlock(D, s, sizeof(s) - sizeof(char))
404        // b"\x1bLua" is &[u8; 4] (no NUL terminator in Rust byte literals), matching the
405        // C expansion of sizeof("\x1bLua")-1 = 4 bytes.
406        self.dump_block(LUA_SIGNATURE)?;
407
408        self.dump_byte(if matches!(self.version, LuaVersion::V55) {
409            LUAC_VERSION_55
410        } else {
411            LUAC_VERSION_54
412        })?;
413
414        self.dump_byte(LUAC_FORMAT)?;
415
416        // b"\x19\x93\r\n\x1a\n" is &[u8; 6], matching sizeof(LUAC_DATA)-1 = 6 bytes.
417        self.dump_block(LUAC_DATA)?;
418
419        if matches!(self.version, LuaVersion::V55) {
420            self.dump_byte(size_of::<i32>() as u8)?;
421            self.dump_raw_i32(LUAC_INT_55 as i32)?;
422
423            self.dump_byte(INSTRUCTION_SIZE)?;
424            self.dump_raw_u32(LUAC_INST_55)?;
425
426            self.dump_byte(LUA_INTEGER_SIZE)?;
427            self.dump_integer(LUAC_INT_55)?;
428
429            self.dump_byte(LUA_NUMBER_SIZE)?;
430            self.dump_number(LUAC_NUM_55)?;
431        } else {
432            self.dump_byte(INSTRUCTION_SIZE)?;
433
434            self.dump_byte(LUA_INTEGER_SIZE)?;
435
436            self.dump_byte(LUA_NUMBER_SIZE)?;
437
438            self.dump_integer(LUAC_INT)?;
439
440            self.dump_number(LUAC_NUM)?;
441        }
442
443        Ok(())
444    }
445}
446
447// ── Public entry point ───────────────────────────────────────────────────────
448
449/// Serialize a compiled Lua function prototype as a precompiled bytecode chunk.
450///
451/// The `writer` callback receives successive slices of the serialised bytes and returns
452/// `Err(LuaError)` to abort. `strip` omits debug info (line numbers, local names, etc.)
453/// from the output.
454///
455///
456/// PORT NOTE: `lua_Writer w` (fn pointer) + `void *data` (userdata) are collapsed into a
457/// single `impl FnMut(&[u8]) -> Result<(), LuaError>` closure — the Rust idiom for the
458/// callback + context pair. `_state` is retained in the signature for API parity but unused
459/// in the body: the C code needed it only for `lua_lock`/`lua_unlock`, which are no-ops per
460/// macros.tsv. Return type changes from `int` (0 = ok, non-zero = writer error) to
461/// `Result<(), LuaError>`.
462pub(crate) fn dump(
463    state: &LuaState,
464    proto: &GcRef<LuaProto>,
465    writer: &mut dyn FnMut(&[u8]) -> Result<(), LuaError>,
466    strip: bool,
467) -> Result<(), LuaError> {
468    let mut d = DumpState {
469        writer,
470        strip,
471        version: state.global().lua_version,
472    };
473
474    d.dump_header()?;
475
476    // PORT NOTE: f->sizeupvalues is covered by Vec::len(). Bounded by MAXUPVAL = 255
477    // (macros.tsv), so truncation via `as u8` is safe for well-formed prototypes.
478    d.dump_byte(proto.upvalues.len() as u8)?;
479
480    // psource = None forces the top-level function to always write its source name.
481    // Deref coercion: &GcRef<LuaProto> → &LuaProto (via Deref<Target=LuaProto> on GcRef/Rc).
482    d.dump_function(proto, None)?;
483
484    Ok(())
485}
486
487// ────────────────────────────────────────────────────────────────────────────
488// PORT STATUS
489//   source:        src/ldump.c  (230 lines, 10 functions)
490//   target_crate:  lua-vm
491//   confidence:    medium
492//   todos:         4
493//   port_notes:    12
494//   unsafe_blocks: 0
495//   notes:         Types/imports need Phase B wiring; logic should be faithful.
496//                  Key uncertainties: (1) Instruction newtype inner-field access (.0 vs
497//                  method); (2) UpvalDesc.name optionality; (3) GcRef::ptr_eq method
498//                  existence. Lineinfo bulk-write is done via collect()+dump_block to
499//                  avoid unsafe transmute of &[i8] → &[u8]; revisit with bytemuck in
500//                  Phase B for performance. Native-endian serialisation via to_ne_bytes()
501//                  matches C's raw-memory dumpVector behaviour.
502// ────────────────────────────────────────────────────────────────────────────