lua_vm/dump.rs
1//! Pre-compiled Lua chunk serializer.
2//!
3//! Translates `reference/lua-5.4.7/src/ldump.c` (230 lines, 9 functions + 1 public entry point).
4//! Writes a `LuaProto` to a byte sink in the standard Lua 5.4 bytecode format.
5
6// TODO(port): Adjust import paths once crate boundaries stabilise in Phase B.
7// The types below are expected to resolve as follows:
8// GcRef — lua_types (or lua-gc Phase D)
9// LuaError — lua_types
10// LuaProto — lua-vm (this crate) or lua-types
11// LuaString — lua-vm / lua-types
12// LuaValue — lua_types
13// LuaState — lua-vm (this crate)
14#[allow(unused_imports)]
15use crate::prelude::*;
16use std::mem::size_of;
17
18use crate::state::LuaState;
19use lua_types::proto::LuaProto;
20use lua_types::{GcRef, LuaError, LuaString, LuaValue, LuaVersion};
21
// ── Constants from lundump.h ─────────────────────────────────────────────────

/// Signature that opens every precompiled chunk.
///
/// C's `dumpLiteral` writes `sizeof(s) - sizeof(char)` bytes, i.e. the literal without its
/// NUL terminator. A Rust byte-string literal has no NUL, so these four bytes are used as-is.
const LUA_SIGNATURE: &[u8] = b"\x1bLua";

/// Encode a `LUA_VERSION_NUM`-style number as the one-byte chunk version:
/// `(num / 100) * 16 + num % 100`, e.g. 504 → 5 * 16 + 4 = 0x54.
const fn luac_version(version_num: i32) -> u8 {
    ((version_num / 100) * 16 + version_num % 100) as u8
}

/// `LUA_VERSION_NUM` for Lua 5.4 (macros.tsv).
const LUA_VERSION_NUM_DUMP_54: i32 = 504;
/// Same numbering scheme applied to Lua 5.5.
const LUA_VERSION_NUM_DUMP_55: i32 = 505;

/// Version byte written for 5.4 chunks (0x54).
const LUAC_VERSION_54: u8 = luac_version(LUA_VERSION_NUM_DUMP_54);
/// Version byte written for 5.5 chunks (0x55).
const LUAC_VERSION_55: u8 = luac_version(LUA_VERSION_NUM_DUMP_55);

/// Format byte written right after the version byte.
const LUAC_FORMAT: u8 = 0;

/// Six-byte data block written after the format byte (`sizeof(LUAC_DATA) - 1 = 6` in C;
/// the Rust literal carries no NUL terminator).
const LUAC_DATA: &[u8] = b"\x19\x93\r\n\x1a\n";

// ── Header sentinels, 5.4 layout ─────────────────────────────────────────────

/// Integer sentinel written in the 5.4 header.
const LUAC_INT: i64 = 0x5678;
/// Float sentinel written in the 5.4 header; lets the loader detect a byte-order mismatch.
const LUAC_NUM: f64 = 370.5;

// ── Header sentinels, 5.5 layout ─────────────────────────────────────────────

/// Integer sentinel written in the 5.5 header (both as a raw `i32` and as an `i64`).
const LUAC_INT_55: i64 = -0x5678;
/// Instruction-word sentinel written in the 5.5 header.
const LUAC_INST_55: u32 = 0x12345678;
/// Float sentinel written in the 5.5 header.
const LUAC_NUM_55: f64 = -370.5;

// ── Type sizes reported in the header ────────────────────────────────────────

/// Size in bytes of one instruction word (`u32`).
const INSTRUCTION_SIZE: u8 = size_of::<u32>() as u8;
/// Size in bytes of a Lua integer (`i64`).
const LUA_INTEGER_SIZE: u8 = size_of::<i64>() as u8;
/// Size in bytes of a Lua number (`f64`).
const LUA_NUMBER_SIZE: u8 = size_of::<f64>() as u8;
57
58// ── DumpState ────────────────────────────────────────────────────────────────
59
60/// Internal state threaded through every dump operation.
61///
62///
63/// PORT NOTE: `lua_State *L` removed — it was used only for `lua_lock`/`lua_unlock`, which are
64/// no-ops in the default Lua build and dropped here (macros.tsv). `void *data` is folded into
65/// the writer closure. `int status` is replaced by `Result<(), LuaError>` propagated with `?`.
66struct DumpState<'a> {
67 /// Byte-sink callback. C original: `lua_Writer writer` + `void *data` (combined).
68 /// lua_Writer type is TBD in types.tsv; for dump we use a bare byte-slice callback.
69 writer: &'a mut dyn FnMut(&[u8]) -> Result<(), LuaError>,
70 /// When true, strip all debug information from the output.
71 strip: bool,
72 version: LuaVersion,
73}
74
75impl<'a> DumpState<'a> {
76 // ── Low-level write primitives ────────────────────────────────────────────
77
78 /// Write raw bytes to the output stream.
79 ///
80 ///
81 /// PORT NOTE: C accumulates errors in `D->status` and skips subsequent writes once
82 /// non-zero; Rust returns `Result<(), LuaError>` and short-circuits via `?`.
83 /// `lua_lock`/`lua_unlock` are no-ops in the default build and are dropped (macros.tsv).
84 fn dump_block(&mut self, data: &[u8]) -> Result<(), LuaError> {
85 if !data.is_empty() {
86 (self.writer)(data)?;
87 }
88 Ok(())
89 }
90
91 /// Write one byte.
92 ///
93 /// C body: `lu_byte x = (lu_byte)y; dumpVar(D, x);`
94 /// (`dumpVar(D,x)` expands to `dumpVector(D,&x,1)` expands to `dumpBlock(D,&x,sizeof(x))`)
95 fn dump_byte(&mut self, y: u8) -> Result<(), LuaError> {
96 self.dump_block(&[y])
97 }
98
99 /// Write a `size_t` using Lua's variable-length encoding.
100 ///
101 ///
102 /// Encoding (big-endian 7-bit groups, **last** byte marked with MSB = 1):
103 /// - Each byte holds 7 payload bits.
104 /// - Bytes are written most-significant group first.
105 /// - The final byte (least-significant group) has its MSB set as an end marker.
106 ///
107 /// This differs from standard LEB128, which marks the *continuation* bytes rather than
108 /// the terminating byte.
109 ///
110 fn dump_size(&mut self, mut x: usize) -> Result<(), LuaError> {
111 // DIBS = (usize::BITS + 6) / 7; on 64-bit = (64+6)/7 = 10.
112 const DIBS: usize = (usize::BITS as usize + 6) / 7;
113 let mut buff = [0u8; DIBS];
114 let mut n: usize = 0;
115
116 loop {
117 n += 1;
118 buff[DIBS - n] = (x & 0x7f) as u8; // fill buffer in reverse order
119 x >>= 7;
120 if x == 0 {
121 break;
122 }
123 }
124
125 // The byte at buff[DIBS-1] is the first byte placed (least-significant group).
126 // Setting its MSB marks it as the terminal byte of the encoding.
127 buff[DIBS - 1] |= 0x80;
128
129 self.dump_block(&buff[DIBS - n..])
130 }
131
132 /// Write an `int` as a variable-length size.
133 ///
134 ///
135 /// PORT NOTE: C implicitly casts `int` → `size_t`. All call sites pass non-negative values
136 /// (line numbers, instruction counts, vector lengths); a debug assertion guards this.
137 fn dump_int(&mut self, x: i32) -> Result<(), LuaError> {
138 debug_assert!(
139 x >= 0,
140 "dump_int: negative value {} cast to usize would wrap",
141 x
142 );
143 self.dump_size(x as usize)
144 }
145
146 /// Write a `lua_Number` (f64) in the platform's native byte order.
147 ///
148 ///
149 /// `dumpVar(D,x)` expands to `dumpBlock(D, &x, sizeof(lua_Number))` — 8 bytes, native order.
150 /// `to_ne_bytes()` replicates native-endian serialisation. The bytecode header's `LUAC_NUM`
151 /// sentinel (370.5) lets `lundump` detect byte-order mismatches at load time.
152 fn dump_number(&mut self, x: f64) -> Result<(), LuaError> {
153 self.dump_block(&x.to_ne_bytes())
154 }
155
156 /// Write a `lua_Integer` (i64) in the platform's native byte order.
157 ///
158 fn dump_integer(&mut self, x: i64) -> Result<(), LuaError> {
159 self.dump_block(&x.to_ne_bytes())
160 }
161
162 fn dump_raw_i32(&mut self, x: i32) -> Result<(), LuaError> {
163 self.dump_block(&x.to_ne_bytes())
164 }
165
166 fn dump_raw_u32(&mut self, x: u32) -> Result<(), LuaError> {
167 self.dump_block(&x.to_ne_bytes())
168 }
169
170 // ── Mid-level serialisers ─────────────────────────────────────────────────
171
172 /// Write an interned or long string, or a null sentinel (encoded size = 0).
173 ///
174 ///
175 /// Encoding: `dumpSize(len + 1)` followed by `len` raw bytes; size 0 means null/absent.
176 /// `tsslen(s)` → `s.len()` and `getstr(s)` → `s.as_bytes()` (macros.tsv).
177 fn dump_string(&mut self, s: Option<&GcRef<LuaString>>) -> Result<(), LuaError> {
178 match s {
179 None => self.dump_size(0),
180
181 Some(s) => {
182 let bytes = s.as_bytes(); // tsslen → .len(); getstr → .as_bytes()
183 self.dump_size(bytes.len() + 1)?;
184 self.dump_block(bytes)
185 }
186 }
187 }
188
189 /// Write the bytecode instruction array.
190 ///
191 ///
192 /// PORT NOTE: `f->sizecode` is covered by `Vec::len()` (types.tsv).
193 fn dump_code(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
194 self.dump_int(proto.code.len() as i32)?;
195
196 // dumpVector writes n * sizeof(Instruction) = n * 4 bytes in native byte order.
197 for instr in &proto.code {
198 // TODO(port): `Instruction` is a u32 newtype (types.tsv). Accessing the inner u32
199 // via `.0` assumes a tuple-struct layout. If the Instruction API differs (e.g.,
200 // exposes `.raw()` or `u32::from(*instr)`), adjust accordingly in Phase B.
201 self.dump_block(&instr.0.to_ne_bytes())?;
202 }
203 Ok(())
204 }
205
206 /// Write the constant pool.
207 ///
208 ///
209 /// Each constant is written as: one tag byte (`ttypetag`), followed by the payload
210 /// (float: 8 bytes; integer: 8 bytes; string: variable-length; nil/bool: nothing).
211 ///
212 /// PORT NOTE: `f->sizek` is covered by `Vec::len()` (types.tsv).
213 fn dump_constants(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
214 let n = proto.k.len();
215 self.dump_int(n as i32)?;
216
217 for constant in &proto.k {
218 // ttypetag(o) → o.full_type_tag() (macros.tsv)
219 // Returns the C-side tag byte: bits 0-3 base type, bits 4-5 variant, bit 6 collectable.
220 let tag = constant.full_type_tag();
221 self.dump_byte(tag)?;
222
223 match constant {
224 LuaValue::Float(f) => {
225 // fltvalue(o) → o.as_float().expect("not float") or `if let` (macros.tsv)
226 self.dump_number(*f)?;
227 }
228 LuaValue::Int(i) => {
229 self.dump_integer(*i)?;
230 }
231 LuaValue::Str(s) => {
232 // tsvalue(o) → o.as_string().expect("not string") (macros.tsv)
233 self.dump_string(Some(s))?;
234 }
235 LuaValue::Nil | LuaValue::Bool(_) => {
236 // Only the tag byte is written; nil and booleans carry no additional payload.
237 // lua_assert → debug_assert! (macros.tsv)
238 debug_assert!(
239 matches!(constant, LuaValue::Nil | LuaValue::Bool(_)),
240 "dump_constants: default branch reached for unexpected variant"
241 );
242 }
243 _ => {
244 // TODO(port): LuaValue variant not valid as a constant-pool entry.
245 // In C the default branch asserts nil/false/true only. Any other variant
246 // here indicates a malformed proto; flag for Phase B investigation.
247 debug_assert!(
248 false,
249 "dump_constants: unexpected LuaValue variant in constant pool"
250 );
251 }
252 }
253 }
254 Ok(())
255 }
256
257 /// Write nested function prototypes (sub-functions defined inside `proto`).
258 ///
259 ///
260 /// PORT NOTE: `f->sizep` is covered by `Vec::len()` (types.tsv).
261 /// The parent's source string is passed down so that children with identical source
262 /// origins can omit the redundant source name (see `dump_function`).
263 fn dump_protos(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
264 let n = proto.p.len();
265 self.dump_int(n as i32)?;
266
267 for sub in &proto.p {
268 // sub: &GcRef<LuaProto>; deref coercion (&GcRef<LuaProto> → &LuaProto) expected
269 // when GcRef<T>: Deref<Target=T> (true for Rc<T> in Phase A).
270 self.dump_function(sub, proto.source.as_ref())?;
271 }
272 Ok(())
273 }
274
275 /// Write upvalue descriptors (instack / idx / kind for each upvalue slot).
276 ///
277 ///
278 /// PORT NOTE: `f->sizeupvalues` is covered by `Vec::len()` (types.tsv).
279 /// `Upvaldesc.instack` is `bool` in Rust (types.tsv); cast to `u8` for the wire format.
280 fn dump_upvalues(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
281 let n = proto.upvalues.len();
282 self.dump_int(n as i32)?;
283
284 for upval in &proto.upvalues {
285 // PORT NOTE: instack is bool in Rust (types.tsv); cast to u8: true→1, false→0.
286 self.dump_byte(upval.instack as u8)?;
287 self.dump_byte(upval.idx)?;
288 self.dump_byte(upval.kind)?;
289 }
290 Ok(())
291 }
292
293 /// Write debug information: per-instruction line deltas, absolute line records,
294 /// local-variable lifetimes, and upvalue names.
295 ///
296 /// All counts are written as zero when `self.strip` is true.
297 ///
298 ///
299 /// PORT NOTE: all `f->size*` fields are covered by `Vec::len()` (types.tsv).
300 fn dump_debug(&mut self, proto: &LuaProto) -> Result<(), LuaError> {
301 let n_lineinfo = if self.strip { 0 } else { proto.lineinfo.len() };
302 self.dump_int(n_lineinfo as i32)?;
303
304 // lineinfo is Vec<i8> (ls_byte per types.tsv). C writes them as raw bytes (sizeof(i8)=1).
305 // Cast each i8 to u8 (same bit pattern) before writing.
306 // PERF(port): iterating one byte at a time vs. bulk write — profile in Phase B.
307 // (A bulk write would require bytemuck::cast_slice or similar to avoid unsafe.)
308 let lineinfo_bytes: Vec<u8> = proto.lineinfo[..n_lineinfo]
309 .iter()
310 .map(|&b| b as u8)
311 .collect();
312 self.dump_block(&lineinfo_bytes)?;
313
314 let n_absline = if self.strip {
315 0
316 } else {
317 proto.abslineinfo.len()
318 };
319 self.dump_int(n_absline as i32)?;
320
321 for abs in proto.abslineinfo.iter().take(n_absline) {
322 // AbsLineInfo.pc and .line are i32 (types.tsv); non-negative in valid bytecode.
323 self.dump_int(abs.pc)?;
324 self.dump_int(abs.line)?;
325 }
326
327 let n_locvars = if self.strip { 0 } else { proto.locvars.len() };
328 self.dump_int(n_locvars as i32)?;
329
330 for locvar in proto.locvars.iter().take(n_locvars) {
331 // LocVar.varname is GcRef<LuaString> (types.tsv).
332 self.dump_string(Some(&locvar.varname))?;
333 self.dump_int(locvar.startpc)?;
334 self.dump_int(locvar.endpc)?;
335 }
336
337 // (Re-uses upvalues.len() for the name-writing pass — separate from dumpUpvalues
338 // which wrote structural descriptors; here we write debug names.)
339 let n_upval_names = if self.strip { 0 } else { proto.upvalues.len() };
340 self.dump_int(n_upval_names as i32)?;
341
342 for upval in proto.upvalues.iter().take(n_upval_names) {
343 // PORT NOTE: UpvalDesc.name is GcRef<LuaString> per types.tsv (non-optional).
344 // TODO(port): In C, `TString *name` can be NULL when an upvalue is unnamed (e.g.,
345 // in bytecode compiled without debug info). Verify whether UpvalDesc.name should be
346 // `Option<GcRef<LuaString>>` in the Rust model; if so, change call to pass the Option
347 // directly instead of wrapping in Some.
348 self.dump_string(upval.name.as_ref())?;
349 }
350 Ok(())
351 }
352
353 /// Write a complete function prototype: source name, header bytes, code, constants,
354 /// upvalue descriptors, nested prototypes, and debug information.
355 ///
356 /// `psource` is the parent function's source string. When `f->source == psource` (pointer
357 /// equality — Lua interns short strings so identical source names share an object), the
358 /// source is written as null (size 0) to avoid duplication. The top-level call passes
359 /// `None` to force writing the source.
360 ///
361 ///
362 /// PORT NOTE: `f->source == psource` is a C pointer comparison exploiting string interning.
363 /// In Rust we use `GcRef::ptr_eq` (equivalent to `Rc::ptr_eq` in Phase A) for identity.
364 /// `is_vararg` is `bool` in Rust (types.tsv); cast to `u8` for the wire format.
365 fn dump_function(
366 &mut self,
367 proto: &LuaProto,
368 psource: Option<&GcRef<LuaString>>,
369 ) -> Result<(), LuaError> {
370 // Pointer-equality check: same interned string object means same source file.
371 let same_source = match (psource, proto.source.as_ref()) {
372 (Some(ps), Some(src)) => GcRef::ptr_eq(src, ps),
373 _ => false,
374 };
375
376 if self.strip || same_source {
377 self.dump_string(None)?;
378 } else {
379 self.dump_string(proto.source.as_ref())?;
380 }
381
382 self.dump_int(proto.linedefined)?;
383 self.dump_int(proto.lastlinedefined)?;
384 self.dump_byte(proto.numparams)?;
385 // PORT NOTE: is_vararg is bool in Rust (types.tsv); true → 1u8, false → 0u8.
386 self.dump_byte(proto.is_vararg as u8)?;
387 self.dump_byte(proto.maxstacksize)?;
388
389 self.dump_code(proto)?;
390 self.dump_constants(proto)?;
391 self.dump_upvalues(proto)?;
392 self.dump_protos(proto)?;
393 self.dump_debug(proto)?;
394 Ok(())
395 }
396
397 /// Write the binary chunk header.
398 ///
399 /// The header allows `lundump` (and external tools) to verify the bytecode format,
400 /// platform word sizes, and byte order before attempting to load the chunk.
401 ///
402 fn dump_header(&mut self) -> Result<(), LuaError> {
403 // dumpLiteral(D,s) = dumpBlock(D, s, sizeof(s) - sizeof(char))
404 // b"\x1bLua" is &[u8; 4] (no NUL terminator in Rust byte literals), matching the
405 // C expansion of sizeof("\x1bLua")-1 = 4 bytes.
406 self.dump_block(LUA_SIGNATURE)?;
407
408 self.dump_byte(if matches!(self.version, LuaVersion::V55) {
409 LUAC_VERSION_55
410 } else {
411 LUAC_VERSION_54
412 })?;
413
414 self.dump_byte(LUAC_FORMAT)?;
415
416 // b"\x19\x93\r\n\x1a\n" is &[u8; 6], matching sizeof(LUAC_DATA)-1 = 6 bytes.
417 self.dump_block(LUAC_DATA)?;
418
419 if matches!(self.version, LuaVersion::V55) {
420 self.dump_byte(size_of::<i32>() as u8)?;
421 self.dump_raw_i32(LUAC_INT_55 as i32)?;
422
423 self.dump_byte(INSTRUCTION_SIZE)?;
424 self.dump_raw_u32(LUAC_INST_55)?;
425
426 self.dump_byte(LUA_INTEGER_SIZE)?;
427 self.dump_integer(LUAC_INT_55)?;
428
429 self.dump_byte(LUA_NUMBER_SIZE)?;
430 self.dump_number(LUAC_NUM_55)?;
431 } else {
432 self.dump_byte(INSTRUCTION_SIZE)?;
433
434 self.dump_byte(LUA_INTEGER_SIZE)?;
435
436 self.dump_byte(LUA_NUMBER_SIZE)?;
437
438 self.dump_integer(LUAC_INT)?;
439
440 self.dump_number(LUAC_NUM)?;
441 }
442
443 Ok(())
444 }
445}
446
447// ── Public entry point ───────────────────────────────────────────────────────
448
449/// Serialize a compiled Lua function prototype as a precompiled bytecode chunk.
450///
451/// The `writer` callback receives successive slices of the serialised bytes and returns
452/// `Err(LuaError)` to abort. `strip` omits debug info (line numbers, local names, etc.)
453/// from the output.
454///
455///
456/// PORT NOTE: `lua_Writer w` (fn pointer) + `void *data` (userdata) are collapsed into a
457/// single `impl FnMut(&[u8]) -> Result<(), LuaError>` closure — the Rust idiom for the
458/// callback + context pair. `_state` is retained in the signature for API parity but unused
459/// in the body: the C code needed it only for `lua_lock`/`lua_unlock`, which are no-ops per
460/// macros.tsv. Return type changes from `int` (0 = ok, non-zero = writer error) to
461/// `Result<(), LuaError>`.
462pub(crate) fn dump(
463 state: &LuaState,
464 proto: &GcRef<LuaProto>,
465 writer: &mut dyn FnMut(&[u8]) -> Result<(), LuaError>,
466 strip: bool,
467) -> Result<(), LuaError> {
468 let mut d = DumpState {
469 writer,
470 strip,
471 version: state.global().lua_version,
472 };
473
474 d.dump_header()?;
475
476 // PORT NOTE: f->sizeupvalues is covered by Vec::len(). Bounded by MAXUPVAL = 255
477 // (macros.tsv), so truncation via `as u8` is safe for well-formed prototypes.
478 d.dump_byte(proto.upvalues.len() as u8)?;
479
480 // psource = None forces the top-level function to always write its source name.
481 // Deref coercion: &GcRef<LuaProto> → &LuaProto (via Deref<Target=LuaProto> on GcRef/Rc).
482 d.dump_function(proto, None)?;
483
484 Ok(())
485}
486
487// ────────────────────────────────────────────────────────────────────────────
488// PORT STATUS
489// source: src/ldump.c (230 lines, 10 functions)
490// target_crate: lua-vm
491// confidence: medium
492// todos: 4
493// port_notes: 12
494// unsafe_blocks: 0
495// notes: Types/imports need Phase B wiring; logic should be faithful.
496// Key uncertainties: (1) Instruction newtype inner-field access (.0 vs
497// method); (2) UpvalDesc.name optionality; (3) GcRef::ptr_eq method
498// existence. Lineinfo bulk-write is done via collect()+dump_block to
499// avoid unsafe transmute of &[i8] → &[u8]; revisit with bytemuck in
500// Phase B for performance. Native-endian serialisation via to_ne_bytes()
501// matches C's raw-memory dumpVector behaviour.
502// ────────────────────────────────────────────────────────────────────────────