qala_compiler/chunk.rs
1//! the bytecode chunk + the whole program: instruction bytes, a constant
2//! pool, and a parallel source-line map; plus the disassembler that turns
3//! them back into the human-readable listing the playground will render in
4//! its bytecode panel.
5//!
6//! one [`Chunk`] is one compiled function (or one throwaway `comptime`
7//! body). a [`Program`] holds every chunk plus the entry-point index, the
8//! globals table, and the parallel name tables the disassembler reads
9//! by index. constant-pool entries are append-only (no dedupe in v1 per
10//! Phase 4 research A6); the resulting deterministic insertion order is
11//! the load-bearing invariant for "compile twice, same bytes" testing.
12//!
13//! the write helpers ([`Chunk::write_op`], [`Chunk::write_u16`],
14//! [`Chunk::write_i16`]) are the ONLY way to append to the byte stream;
15//! they update [`Chunk::source_lines`] in lockstep so the invariant
16//! `code.len() == source_lines.len()` holds after every emission. the
17//! peephole optimizer (Plan 04-05) uses `Vec::splice` directly on both
18//! vectors, with the same lockstep discipline.
19//!
20//! forward jumps use [`Chunk::emit_jump`] / [`Chunk::patch_jump`] -- the
21//! Crafting Interpreters back-patching pair (Phase 4 research Pattern 4);
22//! backward jumps use [`Chunk::emit_loop`] when the target is already
23//! emitted. both error with [`crate::errors::QalaError::Parse`] +
24//! "branch too far" on i16 range overflow (Phase 4 research A1).
25//!
26//! the disassembler ([`Chunk::disassemble`] + [`Program::disassemble`])
27//! is the documented contract for Phase 6's WASM bridge and Phase 8's
28//! bytecode panel. byte-determinism is part of the contract; the inline
29//! `compile_twice_disassemble_twice` test guards it.
30
31use crate::errors::QalaError;
32use crate::opcode::Opcode;
33use crate::span::Span;
34use crate::value::ConstValue;
35use std::fmt::Write as _;
36
37/// a compiled function (or top-level chunk): instruction bytes + a parallel
38/// per-byte source-line map + a constant pool indexed by u16 from the bytes.
39///
40/// the three vectors stay in lockstep; the [`Chunk::write_op`] /
41/// [`Chunk::write_u16`] / [`Chunk::write_i16`] helpers are the only way to
42/// append to `code`, so the invariant is structurally enforced.
43#[derive(Debug, Clone, Default)]
44pub struct Chunk {
45 /// the instruction byte stream; one byte per opcode, plus per-operand
46 /// bytes per [`Opcode::operand_bytes`].
47 pub code: Vec<u8>,
48 /// the per-function constant pool, indexed by u16 from CONST / FIELD /
49 /// MAKE_ENUM_VARIANT operands. append-only -- a rewound CONST leaves a
50 /// dangling entry, harmless because disassemble walks instructions, not
51 /// pool entries. holds [`ConstValue`] entries; the VM converts each to a
52 /// runtime [`crate::value::Value`] when it executes the CONST.
53 pub constants: Vec<ConstValue>,
54 /// the 1-based source line of the byte at the same index in `code`.
55 /// multi-byte instructions duplicate the line across operand bytes so
56 /// `source_lines[i]` is valid for any `i in 0..code.len()`.
57 pub source_lines: Vec<u32>,
58 /// the source name of each local slot, indexed by slot number (the
59 /// GET_LOCAL / SET_LOCAL operand). `local_names[i]` is the name the
60 /// programmer wrote for slot `i` -- `"x"`, `"sum"`, a `for` loop variable,
61 /// a `match` payload binding. a compiler-synthesized temporary (a hidden
62 /// `for`-loop array/index slot) has an empty string; the VM falls back to
63 /// `slot{i}` for those. codegen fills this when it finalizes a function's
64 /// chunk; the VM's `get_state` reads it to surface the in-scope variables
65 /// with their real names rather than slot indices. a chunk built by hand
66 /// (a test, a `comptime` body) leaves it empty -- `Default` gives an empty
67 /// vec, so older callers are unaffected.
68 pub local_names: Vec<String>,
69}
70
71/// one declared struct's runtime identity: its name and its field count.
72///
73/// the [`Program::structs`] table holds one of these per declared struct;
74/// a MAKE_STRUCT opcode's `u16` operand indexes it. the VM reads `name` to
75/// label the heap struct it builds (so `type_of` of a struct returns the
76/// declared name) and `field_count` to know how many values to pop. codegen
77/// fills the table as it emits struct literals.
78#[derive(Debug, Clone, PartialEq, Eq)]
79pub struct StructInfo {
80 /// the struct's declared name, e.g. `"Point"`. the VM stores this as the
81 /// heap struct's `type_name`.
82 pub name: String,
83 /// the number of fields the struct declares. MAKE_STRUCT pops exactly
84 /// this many values off the stack to build the struct.
85 pub field_count: u16,
86}
87
88/// a whole compiled program: one chunk per function, with the entry-point
89/// index and a globals table.
90///
91/// function indexes in CALL opcodes are indexes into [`Program::chunks`];
92/// matching name lookups for the disassembler are in [`Program::fn_names`].
93/// enum variant ids in MAKE_ENUM_VARIANT and MATCH_VARIANT are indexes into
94/// [`Program::enum_variant_names`]. struct ids in MAKE_STRUCT are indexes
95/// into [`Program::structs`].
96#[derive(Debug, Clone, Default)]
97pub struct Program {
98 /// one chunk per function, indexed by function id (u16 in CALL operands).
99 pub chunks: Vec<Chunk>,
100 /// the chunks index where execution starts. Phase 4 codegen sets this
101 /// to the index of the `main` function.
102 pub main_index: usize,
103 /// global names indexed by GET_GLOBAL / SET_GLOBAL u16 operands. v1 has
104 /// no top-level `let` so this is empty in practice -- kept for forward
105 /// compatibility.
106 pub globals: Vec<String>,
107 /// function names parallel to `chunks` -- the disassembler renders
108 /// CALL #fn_id(name).
109 pub fn_names: Vec<String>,
110 /// (enum_name, variant_name) pairs indexed by MAKE_ENUM_VARIANT /
111 /// MATCH_VARIANT u16 operands. the disassembler renders
112 /// #variant_id(Enum::Variant).
113 pub enum_variant_names: Vec<(String, String)>,
114 /// one [`StructInfo`] per declared struct, indexed by MAKE_STRUCT u16
115 /// operands. records each struct's name and field count so the VM can
116 /// build a heap struct with the right `type_name` and pop the right
117 /// number of fields. `Program::default()` gives an empty vec.
118 pub structs: Vec<StructInfo>,
119}
120
121impl Chunk {
122 /// construct an empty chunk. all three vectors start empty; the
123 /// lockstep invariant `code.len() == source_lines.len()` holds
124 /// trivially.
125 pub fn new() -> Self {
126 Self::default()
127 }
128
129 /// emit a zero-operand opcode. `line` is the 1-based source line of the
130 /// originating expression (looked up via `LineIndex::location` at the
131 /// call site). pushes one byte to `code` and one entry to `source_lines`
132 /// so the invariant `code.len() == source_lines.len()` is preserved.
133 pub fn write_op(&mut self, op: Opcode, line: u32) {
134 self.code.push(op as u8);
135 self.source_lines.push(line);
136 }
137
138 /// emit a u16 operand, little-endian. `line` is the same line as the
139 /// preceding opcode -- the map duplicates the line per byte so
140 /// `source_lines[i]` is valid for any `i in 0..code.len()`.
141 pub fn write_u16(&mut self, v: u16, line: u32) {
142 let bytes = v.to_le_bytes();
143 self.code.push(bytes[0]);
144 self.code.push(bytes[1]);
145 self.source_lines.push(line);
146 self.source_lines.push(line);
147 }
148
149 /// emit a signed i16 operand for relative jump offsets. uses
150 /// two's-complement little-endian, the same byte sequence on every
151 /// platform Rust supports. delegates to [`Chunk::write_u16`] via a
152 /// `v as u16` cast (the bit pattern is identical).
153 pub fn write_i16(&mut self, v: i16, line: u32) {
154 self.write_u16(v as u16, line);
155 }
156
157 /// read a u16 operand at byte offset `off`. the inverse of
158 /// [`Chunk::write_u16`]. callers must ensure `off + 2 <= code.len()`.
159 pub fn read_u16(&self, off: usize) -> u16 {
160 u16::from_le_bytes([self.code[off], self.code[off + 1]])
161 }
162
163 /// read an i16 operand at byte offset `off`. the inverse of
164 /// [`Chunk::write_i16`]. callers must ensure `off + 2 <= code.len()`.
165 pub fn read_i16(&self, off: usize) -> i16 {
166 i16::from_le_bytes([self.code[off], self.code[off + 1]])
167 }
168
169 /// append a value to the constant pool and return its u16 index. the
170 /// pool is append-only (no dedupe in v1 per Phase 4 research A6);
171 /// determinism follows from this. the assumption is that 65536
172 /// constants per chunk is more than realistic; debug builds assert
173 /// the cap, production builds keep the cast for v1. widening to u32
174 /// is a v2 concern.
175 pub fn add_constant(&mut self, v: ConstValue) -> u16 {
176 let idx = self.constants.len();
177 debug_assert!(
178 idx <= u16::MAX as usize,
179 "constant pool overflow -- chunk has >65536 constants"
180 );
181 self.constants.push(v);
182 idx as u16
183 }
184
185 /// peek the most recent CONST emission. returns `(byte_offset, pool_idx)`
186 /// when the last full instruction in `code` is a CONST + u16 operand;
187 /// `None` otherwise.
188 ///
189 /// the constant folder in codegen uses this before emitting a binary
190 /// opcode: when both operands are CONSTs the folder can compute the
191 /// result at compile time.
192 pub fn last_const(&self) -> Option<(usize, u16)> {
193 let n = self.code.len();
194 if n < 3 {
195 return None;
196 }
197 if self.code[n - 3] != (Opcode::Const as u8) {
198 return None;
199 }
200 let idx = u16::from_le_bytes([self.code[n - 2], self.code[n - 1]]);
201 Some((n - 3, idx))
202 }
203
204 /// peek the CONST emission BEFORE the most recent CONST. returns
205 /// `(byte_offset, pool_idx)` when the chunk ends in two consecutive
206 /// `CONST u16` instructions; `None` otherwise. used by the binary-
207 /// expression constant folder to peek both operands before deciding
208 /// to fold.
209 pub fn second_to_last_const(&self) -> Option<(usize, u16)> {
210 let (off1, _) = self.last_const()?;
211 if off1 < 3 {
212 return None;
213 }
214 if self.code[off1 - 3] != (Opcode::Const as u8) {
215 return None;
216 }
217 let idx = u16::from_le_bytes([self.code[off1 - 2], self.code[off1 - 1]]);
218 Some((off1 - 3, idx))
219 }
220
221 /// rewind the most recent CONST: drop its 3 bytes from `code` and 3
222 /// entries from `source_lines`, returning the pool index. the pool
223 /// itself is NOT shrunk -- the rewound entry remains in
224 /// [`Chunk::constants`] as a harmless orphan. the determinism contract
225 /// (append-only pool) depends on never mutating prior pool entries.
226 ///
227 /// returns `None` when the chunk does not end with a CONST instruction
228 /// (i.e. [`Chunk::last_const`] returns `None`). callers that have
229 /// already peeked via `last_const` / `second_to_last_const` will
230 /// always receive `Some`; the `None` path exists so the WASM build
231 /// cannot panic on misuse.
232 pub fn rewind_last_const(&mut self) -> Option<u16> {
233 let (off, idx) = self.last_const()?;
234 self.code.truncate(off);
235 self.source_lines.truncate(off);
236 Some(idx)
237 }
238
239 /// emit a forward jump with a placeholder i16 operand. returns the
240 /// byte position of the FIRST operand byte so the caller can later
241 /// [`Chunk::patch_jump`] it to the real target.
242 ///
243 /// uses [`i16::MAX`] as the placeholder so an unpatched offset is
244 /// visually obvious in a disassembly. callers MUST patch the jump
245 /// before completing the chunk; an unpatched jump is a codegen bug.
246 pub fn emit_jump(&mut self, op: Opcode, line: u32) -> usize {
247 self.write_op(op, line);
248 let pos = self.code.len();
249 self.write_i16(i16::MAX, line);
250 pos
251 }
252
253 /// patch the i16 operand bytes at `pos` to reach the current end of
254 /// `code`. the offset is computed relative to the byte AFTER the
255 /// operand (i.e. `code.len() - pos - 2`), matching the VM's branch
256 /// arithmetic per Crafting Interpreters chapter 23.
257 ///
258 /// returns `Err(QalaError::Parse { ..., "branch too far" })` when the
259 /// computed offset would not fit in i16. this is a v1 limitation
260 /// (Phase 4 research A1); widening to i32 is a v2 concern.
261 pub fn patch_jump(&mut self, pos: usize) -> Result<(), QalaError> {
262 let target = self.code.len();
263 let jump = target as isize - pos as isize - 2;
264 if jump < i16::MIN as isize || jump > i16::MAX as isize {
265 return Err(QalaError::Parse {
266 span: Span::new(0, 0),
267 message: format!("branch too far -- chunk exceeds i16 jump range ({jump} bytes)"),
268 });
269 }
270 let bytes = (jump as i16).to_le_bytes();
271 self.code[pos] = bytes[0];
272 self.code[pos + 1] = bytes[1];
273 Ok(())
274 }
275
276 /// emit a backward jump whose target is `loop_start`. unlike
277 /// [`Chunk::emit_jump`], the offset is known up-front so no patching
278 /// is needed. the offset is computed from the byte AFTER the 2-byte
279 /// operand back to `loop_start` (i.e. `loop_start - pos - 3` where
280 /// `pos` is the byte position of the JUMP opcode itself).
281 ///
282 /// returns the same `Err(QalaError::Parse { ..., "branch too far" })`
283 /// when the offset exceeds i16 range.
284 pub fn emit_loop(&mut self, loop_start: usize, line: u32) -> Result<(), QalaError> {
285 let pos = self.code.len();
286 let offset = loop_start as isize - pos as isize - 3;
287 if offset < i16::MIN as isize || offset > i16::MAX as isize {
288 return Err(QalaError::Parse {
289 span: Span::new(0, 0),
290 message: format!("branch too far -- chunk exceeds i16 jump range ({offset} bytes)"),
291 });
292 }
293 self.write_op(Opcode::Jump, line);
294 self.write_i16(offset as i16, line);
295 Ok(())
296 }
297
298 /// produce the human-readable listing for this chunk. one line per
299 /// instruction: byte offset, opcode name (uppercase), operand
300 /// interpretation, source line. the format is the documented contract
301 /// for Phase 6's WASM bridge and Phase 8's playground bytecode panel.
302 ///
303 /// byte-deterministic: same input bytes produce byte-identical output
304 /// on every run. no HashMap iteration, no debug formatters; constants
305 /// and program-side name tables are accessed by direct index.
306 ///
307 /// the `program` argument supplies the function-name and enum-variant-
308 /// name lookup tables; the chunk's own constant pool supplies CONST
309 /// operand renderings.
310 pub fn disassemble(&self, program: &Program) -> String {
311 let mut out = String::new();
312 let mut off = 0usize;
313 while off < self.code.len() {
314 let byte = self.code[off];
315 let line = self.source_lines[off];
316 match Opcode::from_u8(byte) {
317 None => {
318 // defensive against a corrupted byte stream (e.g. via the
319 // WASM bridge); never produced by codegen.
320 let _ = writeln!(out, "{off:>4} <unknown:{byte:#x}> ; line {line}");
321 off += 1;
322 continue;
323 }
324 Some(op) => {
325 let operand = self.operand_string(op, off, program);
326 let _ = writeln!(
327 out,
328 "{off:>4} {name:<9} {operand:<24} ; line {line}",
329 name = op.name(),
330 );
331 off += 1 + op.operand_bytes() as usize;
332 }
333 }
334 }
335 out
336 }
337
338 /// render the operand string for the opcode starting at `off` in
339 /// `code`. caller has already verified `Opcode::from_u8(code[off])`
340 /// is `Some(op)`. the formatting table is the locked playground
341 /// contract per the Phase 4 plan.
342 fn operand_string(&self, op: Opcode, off: usize, program: &Program) -> String {
343 match op {
344 // CONST(u16 idx) -> `#idx value`. the constant lives in the chunk.
345 Opcode::Const => {
346 let idx = self.read_u16(off + 1);
347 let val = self
348 .constants
349 .get(idx as usize)
350 .map(|v| v.to_string())
351 .unwrap_or_else(|| "?".to_string());
352 format!("#{idx} {val}")
353 }
354 // GET_LOCAL / SET_LOCAL (u16 slot) -> `slot`.
355 Opcode::GetLocal | Opcode::SetLocal => {
356 let slot = self.read_u16(off + 1);
357 format!("{slot}")
358 }
359 // GET_GLOBAL / SET_GLOBAL (u16 idx) -> `#idx(name)`.
360 Opcode::GetGlobal | Opcode::SetGlobal => {
361 let idx = self.read_u16(off + 1);
362 let name = program
363 .globals
364 .get(idx as usize)
365 .map(String::as_str)
366 .unwrap_or("?");
367 format!("#{idx}({name})")
368 }
369 // JUMP / JUMP_IF_FALSE / JUMP_IF_TRUE (i16 offset) ->
370 // `+offset (-> target)` where target = (off + 1 + 2) + offset.
371 Opcode::Jump | Opcode::JumpIfFalse | Opcode::JumpIfTrue => {
372 let offset = self.read_i16(off + 1);
373 let base = (off + 1 + 2) as isize;
374 let target = base + offset as isize;
375 format!("{offset:+} (-> {target})")
376 }
377 // CALL (u16 fn_id + u8 argc) -> `#fn_id(name)/argc`.
378 Opcode::Call => {
379 let fn_id = self.read_u16(off + 1);
380 let argc = self.code[off + 3];
381 let name = program
382 .fn_names
383 .get(fn_id as usize)
384 .map(String::as_str)
385 .unwrap_or("?");
386 format!("#{fn_id}({name})/{argc}")
387 }
388 // MAKE_ARRAY / MAKE_TUPLE (u16 count) -> `count`.
389 Opcode::MakeArray | Opcode::MakeTuple => {
390 let count = self.read_u16(off + 1);
391 format!("{count}")
392 }
393 // MAKE_STRUCT (u16 struct_id) -> `#id(StructName)`. the operand
394 // indexes Program.structs; a bad index renders `?`, matching the
395 // other name-table lookups.
396 Opcode::MakeStruct => {
397 let id = self.read_u16(off + 1);
398 let name = program
399 .structs
400 .get(id as usize)
401 .map(|s| s.name.as_str())
402 .unwrap_or("?");
403 format!("#{id}({name})")
404 }
405 // MAKE_ENUM_VARIANT (u16 variant_id + u8 payload_count) ->
406 // `#variant_id(Enum::Variant)/payload_count`.
407 Opcode::MakeEnumVariant => {
408 let variant_id = self.read_u16(off + 1);
409 let payload = self.code[off + 3];
410 let (enum_name, variant_name) = program
411 .enum_variant_names
412 .get(variant_id as usize)
413 .map(|(e, v)| (e.as_str(), v.as_str()))
414 .unwrap_or(("?", "?"));
415 format!("#{variant_id}({enum_name}::{variant_name})/{payload}")
416 }
417 // FIELD (u16 name_index) -> `#idx`. v1 does not yet track field
418 // names beyond the constant pool; the raw index is shown. the
419 // playground can decorate this in Phase 8 if needed.
420 Opcode::Field => {
421 let idx = self.read_u16(off + 1);
422 format!("#{idx}")
423 }
424 // CONCAT_N (u16 count) -> `count`.
425 Opcode::ConcatN => {
426 let count = self.read_u16(off + 1);
427 format!("{count}")
428 }
429 // MATCH_VARIANT (u16 variant_id + i16 offset) ->
430 // `#variant_id(Enum::Variant) miss-> target`. target is computed
431 // from the byte AFTER the 4-byte operand layout.
432 Opcode::MatchVariant => {
433 let variant_id = self.read_u16(off + 1);
434 let offset = self.read_i16(off + 3);
435 let (enum_name, variant_name) = program
436 .enum_variant_names
437 .get(variant_id as usize)
438 .map(|(e, v)| (e.as_str(), v.as_str()))
439 .unwrap_or(("?", "?"));
440 let base = (off + 1 + 4) as isize;
441 let target = base + offset as isize;
442 format!("#{variant_id}({enum_name}::{variant_name}) miss-> {target}")
443 }
444 // zero-operand opcodes: empty operand string.
445 Opcode::Pop
446 | Opcode::Dup
447 | Opcode::Add
448 | Opcode::Sub
449 | Opcode::Mul
450 | Opcode::Div
451 | Opcode::Mod
452 | Opcode::Neg
453 | Opcode::FAdd
454 | Opcode::FSub
455 | Opcode::FMul
456 | Opcode::FDiv
457 | Opcode::FNeg
458 | Opcode::Eq
459 | Opcode::Ne
460 | Opcode::Lt
461 | Opcode::Le
462 | Opcode::Gt
463 | Opcode::Ge
464 | Opcode::FEq
465 | Opcode::FNe
466 | Opcode::FLt
467 | Opcode::FLe
468 | Opcode::FGt
469 | Opcode::FGe
470 | Opcode::Not
471 | Opcode::Return
472 | Opcode::Index
473 | Opcode::Len
474 | Opcode::ToStr
475 | Opcode::Halt => String::new(),
476 }
477 }
478}
479
480impl Program {
481 /// construct an empty program. all vectors start empty; `main_index`
482 /// defaults to 0.
483 pub fn new() -> Self {
484 Self::default()
485 }
486
487 /// produce the human-readable listing for the whole program: one
488 /// header line per chunk followed by the chunk's body. byte-
489 /// deterministic across runs by construction (iterates `chunks` in
490 /// index order; no HashMap iteration). the format matches the locked
491 /// playground bytecode-panel contract.
492 pub fn disassemble(&self) -> String {
493 let mut out = String::new();
494 for (i, chunk) in self.chunks.iter().enumerate() {
495 let name = self.fn_names.get(i).map(String::as_str).unwrap_or("?");
496 let _ = writeln!(out, "== chunk: {name} (fn_id {i}) ==");
497 out.push_str(&chunk.disassemble(self));
498 if i + 1 < self.chunks.len() {
499 out.push('\n');
500 }
501 }
502 out
503 }
504
505 /// run the peephole optimizer over every chunk in this program. invokes
506 /// [`crate::optimizer::peephole`]. idempotent: a second call does no
507 /// work, because the first pass already collapsed every matchable
508 /// instruction window.
509 ///
510 /// optimization is opt-in -- the disassembler can show un-optimized
511 /// bytecode by skipping this call, which supports the playground's
512 /// "show me what the compiler did" use case.
513 pub fn optimize(&mut self) {
514 for chunk in &mut self.chunks {
515 crate::optimizer::peephole(chunk);
516 }
517 }
518}
519
520#[cfg(test)]
521mod tests {
522 use super::*;
523
524 /// build a chunk that emits a single CONST instruction for `v` on
525 /// line 1. the constant lives at pool index 0.
526 fn chunk_with_const(v: ConstValue) -> Chunk {
527 let mut c = Chunk::new();
528 let idx = c.add_constant(v);
529 c.write_op(Opcode::Const, 1);
530 c.write_u16(idx, 1);
531 c
532 }
533
534 /// build a one-chunk Program named "main" wrapping `chunk` so the
535 /// disassembler has the parallel name table it needs.
536 fn program_with_chunk(chunk: Chunk) -> Program {
537 let mut p = Program::new();
538 p.chunks.push(chunk);
539 p.fn_names.push("main".to_string());
540 p
541 }
542
543 // Test 1: the lockstep invariant -- after any sequence of
544 // write_op/write_u16/write_i16 calls, code.len() == source_lines.len().
545 #[test]
546 fn code_and_source_lines_stay_in_lockstep_after_mixed_writes() {
547 let mut c = Chunk::new();
548 c.write_op(Opcode::Const, 1);
549 c.write_u16(0, 1);
550 c.write_op(Opcode::Pop, 2);
551 c.write_op(Opcode::Const, 3);
552 c.write_u16(1, 3);
553 c.write_op(Opcode::Halt, 4);
554 // CONST (1 byte + 2-byte u16) + POP (1) + CONST (1 + 2) + HALT (1)
555 // = 3 + 1 + 3 + 1 = 8 bytes.
556 assert_eq!(c.code.len(), 8);
557 assert_eq!(c.code.len(), c.source_lines.len());
558 // every byte has the line of its originating instruction; multi-
559 // byte instructions duplicate the line across operand bytes.
560 assert_eq!(c.source_lines, vec![1, 1, 1, 2, 3, 3, 3, 4]);
561 }
562
563 // Test 2: write_u16 + read_u16 round-trip across the full u16 range.
564 #[test]
565 fn write_u16_then_read_u16_is_a_round_trip_with_little_endian_bytes() {
566 for v in [0u16, 1, 0xFF, 0x100, 0x1234, 0xFFFF] {
567 let mut c = Chunk::new();
568 c.write_u16(v, 1);
569 assert_eq!(c.read_u16(0), v, "round-trip failed for {v:#x}");
570 }
571 // verify little-endian byte order explicitly: 0x1234 -> [0x34, 0x12].
572 let mut c = Chunk::new();
573 c.write_u16(0x1234, 1);
574 assert_eq!(c.code, vec![0x34, 0x12]);
575 }
576
577 // Test 3: write_i16 + read_i16 round-trip across the i16 range.
578 #[test]
579 fn write_i16_then_read_i16_is_a_round_trip_with_twos_complement() {
580 for v in [0i16, 1, -1, i16::MAX, i16::MIN, -32, 32] {
581 let mut c = Chunk::new();
582 c.write_i16(v, 1);
583 assert_eq!(c.read_i16(0), v, "round-trip failed for {v}");
584 }
585 }
586
587 // Test 4: add_constant returns sequential indices, no dedupe.
588 #[test]
589 fn add_constant_returns_sequential_indices_without_dedupe() {
590 let mut c = Chunk::new();
591 assert_eq!(c.add_constant(ConstValue::I64(1)), 0);
592 assert_eq!(c.add_constant(ConstValue::I64(2)), 1);
593 assert_eq!(c.add_constant(ConstValue::Str("x".to_string())), 2);
594 // NO dedupe: the same value pushed twice gets a new index.
595 assert_eq!(c.add_constant(ConstValue::I64(1)), 3);
596 assert_eq!(c.constants.len(), 4);
597 }
598
599 // Test 5: last_const returns None when the trailing op is not CONST,
600 // Some when it is.
601 #[test]
602 fn last_const_returns_none_when_trailing_op_is_not_const() {
603 let mut c = Chunk::new();
604 c.write_op(Opcode::Const, 1);
605 c.write_u16(7, 1);
606 c.write_op(Opcode::Pop, 2);
607 // last op is POP, not CONST.
608 assert_eq!(c.last_const(), None);
609 }
610
611 #[test]
612 fn last_const_returns_offset_and_index_for_trailing_const_and_rewind_clears_it() {
613 let mut c = Chunk::new();
614 c.write_op(Opcode::Const, 1);
615 c.write_u16(7, 1);
616 assert_eq!(c.last_const(), Some((0, 7)));
617 let idx = c
618 .rewind_last_const()
619 .expect("rewind on a trailing CONST returns Some");
620 assert_eq!(idx, 7);
621 assert_eq!(c.code.len(), 0);
622 assert_eq!(c.source_lines.len(), 0);
623 }
624
625 // Test 6: second_to_last_const peeks the CONST before the last CONST.
626 #[test]
627 fn second_to_last_const_peeks_the_const_before_the_trailing_const() {
628 let mut c = Chunk::new();
629 c.write_op(Opcode::Const, 1);
630 c.write_u16(0, 1);
631 c.write_op(Opcode::Const, 1);
632 c.write_u16(1, 1);
633 c.write_op(Opcode::Const, 1);
634 c.write_u16(2, 1);
635 // last_const() peeks the third one; second_to_last_const peeks the
636 // second (byte offset 3, pool index 1).
637 assert_eq!(c.last_const(), Some((6, 2)));
638 assert_eq!(c.second_to_last_const(), Some((3, 1)));
639 }
640
641 #[test]
642 fn second_to_last_const_is_none_when_only_one_const_is_in_the_chunk() {
643 let mut c = Chunk::new();
644 c.write_op(Opcode::Const, 1);
645 c.write_u16(0, 1);
646 assert_eq!(c.last_const(), Some((0, 0)));
647 // only one CONST in the chunk; nothing before it.
648 assert_eq!(c.second_to_last_const(), None);
649 }
650
651 // Test 7: the lockstep invariant survives a rewind.
652 #[test]
653 fn lockstep_invariant_survives_rewind_last_const() {
654 let mut c = Chunk::new();
655 c.write_op(Opcode::Const, 1);
656 c.write_u16(0, 1);
657 c.write_op(Opcode::Const, 2);
658 c.write_u16(1, 2);
659 assert_eq!(c.code.len(), c.source_lines.len());
660 // rewind the trailing CONST; one CONST remains.
661 let idx = c
662 .rewind_last_const()
663 .expect("rewind on a trailing CONST returns Some");
664 assert_eq!(idx, 1);
665 assert_eq!(c.code.len(), 3);
666 assert_eq!(c.source_lines.len(), 3);
667 assert_eq!(c.code.len(), c.source_lines.len());
668 }
669
670 // Test 7b: rewind_last_const returns None when there is no trailing CONST.
671 #[test]
672 fn rewind_last_const_returns_none_when_no_trailing_const() {
673 let mut c = Chunk::new();
674 // empty chunk
675 assert_eq!(c.rewind_last_const(), None);
676 // chunk ending in a non-CONST instruction
677 c.write_op(Opcode::Pop, 1);
678 assert_eq!(c.rewind_last_const(), None);
679 assert_eq!(c.code.len(), 1, "code must be unchanged after None return");
680 }
681
682 // Test 8: emit_jump returns the operand offset, placeholder is i16::MAX.
683 #[test]
684 fn emit_jump_returns_the_operand_offset_and_writes_i16_max_placeholder() {
685 let mut c = Chunk::new();
686 let pos = c.emit_jump(Opcode::JumpIfFalse, 5);
687 // the JUMP_IF_FALSE opcode is at byte 0; the operand starts at byte 1.
688 assert_eq!(pos, 1);
689 assert_eq!(c.read_i16(pos), i16::MAX);
690 // total emission is 3 bytes (opcode + 2-byte operand).
691 assert_eq!(c.code.len(), 3);
692 assert_eq!(c.source_lines, vec![5, 5, 5]);
693 }
694
695 // Test 9: patch_jump fixes the offset relative to the byte AFTER the
696 // operand.
697 #[test]
698 fn patch_jump_writes_the_offset_to_the_current_end_of_code() {
699 let mut c = Chunk::new();
700 let pos = c.emit_jump(Opcode::JumpIfFalse, 5);
701 // grow the chunk by 4 more bytes so the jump target is at offset 7.
702 c.write_op(Opcode::Const, 6);
703 c.write_u16(0, 6);
704 c.write_op(Opcode::Pop, 6);
705 // patch the jump to the current end (offset 7).
706 // expected offset = 7 - pos - 2 = 7 - 1 - 2 = 4.
707 c.patch_jump(pos).unwrap();
708 assert_eq!(c.read_i16(pos), 4);
709 }
710
711 // Test 10: patch_jump errors on out-of-range.
712 #[test]
713 fn patch_jump_returns_branch_too_far_when_offset_exceeds_i16_max() {
714 let mut c = Chunk::new();
715 // pre-fill the chunk with bytes in lockstep so the jump target is
716 // unreachable in i16. resize to slightly beyond i16::MAX so the
717 // distance from operand_offset+2 exceeds i16::MAX.
718 let len = (i16::MAX as usize) + 100;
719 c.code.resize(len, 0);
720 c.source_lines.resize(len, 0);
721 // pretend pos=0 carries an unpatched operand; the distance from 2
722 // to len (= len - 2) is > i16::MAX.
723 let pos = 0usize;
724 match c.patch_jump(pos) {
725 Err(QalaError::Parse { message, .. }) => {
726 assert!(
727 message.contains("branch too far"),
728 "expected 'branch too far' in message, got: {message:?}"
729 );
730 }
731 other => panic!("expected QalaError::Parse, got: {other:?}"),
732 }
733 }
734
735 // Test 11: emit_loop emits a negative offset that lands at loop_start.
736 #[test]
737 fn emit_loop_writes_a_negative_offset_that_lands_at_loop_start() {
738 let mut c = Chunk::new();
739 // pretend the loop entry is at byte 0; grow the chunk to byte 10.
740 let loop_start = 0usize;
741 // grow with a CONST instruction + some padding.
742 c.write_op(Opcode::Const, 1);
743 c.write_u16(0, 1); // bytes 0..=2
744 c.write_op(Opcode::Const, 1);
745 c.write_u16(1, 1); // bytes 3..=5
746 c.write_op(Opcode::Const, 1);
747 c.write_u16(2, 1); // bytes 6..=8
748 c.write_op(Opcode::Pop, 1); // byte 9
749 assert_eq!(c.code.len(), 10);
750 // emit_loop writes JUMP + i16 offset at byte 10; offset = 0 - 10 - 3 = -13.
751 c.emit_loop(loop_start, 2).unwrap();
752 let operand_offset = 11; // byte after the JUMP opcode at 10.
753 assert_eq!(c.read_i16(operand_offset), -13);
754 // VM arithmetic: pc = (operand_offset + 2) + offset = 13 + (-13) = 0 = loop_start.
755 }
756
757 // Test 12: emit_loop errors on out-of-range backward jumps.
758 #[test]
759 fn emit_loop_returns_branch_too_far_when_backward_distance_exceeds_i16_min() {
760 let mut c = Chunk::new();
761 // grow the chunk to be far past i16::MAX bytes from the start.
762 let len = (i16::MAX as usize) + 100;
763 c.code.resize(len, 0);
764 c.source_lines.resize(len, 0);
765 // attempt a backward jump to byte 0; the offset is more negative
766 // than i16::MIN.
767 match c.emit_loop(0, 1) {
768 Err(QalaError::Parse { message, .. }) => {
769 assert!(
770 message.contains("branch too far"),
771 "expected 'branch too far' in message, got: {message:?}"
772 );
773 }
774 other => panic!("expected QalaError::Parse, got: {other:?}"),
775 }
776 }
777
778 // Test 13: disassemble of an empty chunk produces an empty string.
779 #[test]
780 fn disassemble_of_an_empty_chunk_produces_an_empty_string() {
781 let c = Chunk::new();
782 let p = Program::new();
783 assert_eq!(c.disassemble(&p), "");
784 }
785
786 // Test 14: disassemble of a single CONST instruction matches the locked
787 // format.
788 #[test]
789 fn disassemble_of_one_const_instruction_matches_the_locked_format() {
790 let c = chunk_with_const(ConstValue::I64(42));
791 let p = program_with_chunk(c.clone());
792 let out = c.disassemble(&p);
793 // locked format: byte-offset right-aligned 4, opcode name padded 9,
794 // operand padded 24, then "; line N\n".
795 // operand for CONST is "#0 42".
796 assert_eq!(out, " 0 CONST #0 42 ; line 1\n");
797 }
798
799 // Test 15: every operand kind renders per the locked table.
800 #[test]
801 fn disassemble_renders_every_operand_kind_per_the_locked_table() {
802 let mut p = Program::new();
803 p.globals.push("g".to_string());
804 p.fn_names.push("main".to_string());
805 p.fn_names.push("callee".to_string());
806 p.enum_variant_names
807 .push(("Shape".to_string(), "Circle".to_string()));
808
809 let mut c = Chunk::new();
810 // CONST (u16 idx)
811 let const_idx = c.add_constant(ConstValue::I64(7));
812 c.write_op(Opcode::Const, 1);
813 c.write_u16(const_idx, 1);
814 // GET_LOCAL 0
815 c.write_op(Opcode::GetLocal, 2);
816 c.write_u16(0, 2);
817 // GET_GLOBAL 0
818 c.write_op(Opcode::GetGlobal, 3);
819 c.write_u16(0, 3);
820 // JUMP +5
821 c.write_op(Opcode::Jump, 4);
822 c.write_i16(5, 4);
823 // CALL #1(callee)/2
824 c.write_op(Opcode::Call, 5);
825 c.write_u16(1, 5);
826 c.code.push(2);
827 c.source_lines.push(5);
828 // MAKE_ARRAY 3
829 c.write_op(Opcode::MakeArray, 6);
830 c.write_u16(3, 6);
831 // MAKE_ENUM_VARIANT #0(Shape::Circle)/1
832 c.write_op(Opcode::MakeEnumVariant, 7);
833 c.write_u16(0, 7);
834 c.code.push(1);
835 c.source_lines.push(7);
836 // FIELD #2
837 c.write_op(Opcode::Field, 8);
838 c.write_u16(2, 8);
839 // CONCAT_N 4
840 c.write_op(Opcode::ConcatN, 9);
841 c.write_u16(4, 9);
842 // MATCH_VARIANT #0 miss-> target
843 c.write_op(Opcode::MatchVariant, 10);
844 c.write_u16(0, 10);
845 c.write_i16(8, 10);
846
847 p.chunks.push(c.clone());
848 let out = c.disassemble(&p);
849 // each rendered fragment must appear somewhere in the output.
850 assert!(
851 out.contains("CONST #0 7"),
852 "missing CONST line in:\n{out}"
853 );
854 assert!(
855 out.contains("GET_LOCAL 0"),
856 "missing GET_LOCAL line in:\n{out}"
857 );
858 assert!(
859 out.contains("GET_GLOBAL #0(g)"),
860 "missing GET_GLOBAL line in:\n{out}"
861 );
862 assert!(
863 out.contains("JUMP +5"),
864 "missing JUMP signed-offset rendering in:\n{out}"
865 );
866 assert!(
867 out.contains("CALL #1(callee)/2"),
868 "missing CALL line in:\n{out}"
869 );
870 assert!(
871 out.contains("MAKE_ARRAY 3"),
872 "missing MAKE_ARRAY line in:\n{out}"
873 );
874 assert!(
875 out.contains("MAKE_ENUM_VARIANT #0(Shape::Circle)/1"),
876 "missing MAKE_ENUM_VARIANT line in:\n{out}"
877 );
878 assert!(
879 out.contains("FIELD #2"),
880 "missing FIELD line in:\n{out}"
881 );
882 assert!(
883 out.contains("CONCAT_N 4"),
884 "missing CONCAT_N line in:\n{out}"
885 );
886 assert!(
887 out.contains("MATCH_VARIANT #0(Shape::Circle) miss-> "),
888 "missing MATCH_VARIANT line in:\n{out}"
889 );
890 }
891
892 // Test 15b: MAKE_STRUCT renders its u16 operand as `#id(StructName)` by
893 // looking the id up in program.structs; a bad id falls back to `?`.
894 #[test]
895 fn disassemble_renders_make_struct_with_its_struct_name() {
896 let mut p = Program::new();
897 p.fn_names.push("main".to_string());
898 // struct id 0 -> "Point" with two fields.
899 p.structs.push(StructInfo {
900 name: "Point".to_string(),
901 field_count: 2,
902 });
903
904 let mut c = Chunk::new();
905 // MAKE_STRUCT #0 -- the registered struct.
906 c.write_op(Opcode::MakeStruct, 1);
907 c.write_u16(0, 1);
908 // MAKE_STRUCT #9 -- an unregistered id, renders the `?` fallback.
909 c.write_op(Opcode::MakeStruct, 2);
910 c.write_u16(9, 2);
911
912 p.chunks.push(c.clone());
913 let out = c.disassemble(&p);
914 assert!(
915 out.contains("MAKE_STRUCT #0(Point)"),
916 "missing MAKE_STRUCT struct-name rendering in:\n{out}"
917 );
918 assert!(
919 out.contains("MAKE_STRUCT #9(?)"),
920 "missing MAKE_STRUCT bad-id fallback in:\n{out}"
921 );
922 }
923
924 // Test 16: disassemble is byte-deterministic across runs.
925 #[test]
926 fn disassemble_is_byte_identical_when_called_twice_on_the_same_chunk() {
927 // build a non-trivial chunk covering 5+ opcode kinds and a few
928 // constants.
929 let mut p = Program::new();
930 p.fn_names.push("main".to_string());
931 p.fn_names.push("helper".to_string());
932
933 let mut c = Chunk::new();
934 let i0 = c.add_constant(ConstValue::I64(1));
935 let i1 = c.add_constant(ConstValue::Str("hi".to_string()));
936 let i2 = c.add_constant(ConstValue::F64(2.5));
937
938 c.write_op(Opcode::Const, 1);
939 c.write_u16(i0, 1);
940 c.write_op(Opcode::Const, 1);
941 c.write_u16(i1, 1);
942 c.write_op(Opcode::Const, 1);
943 c.write_u16(i2, 1);
944 c.write_op(Opcode::Add, 2);
945 c.write_op(Opcode::Pop, 2);
946 c.write_op(Opcode::GetLocal, 3);
947 c.write_u16(0, 3);
948 c.write_op(Opcode::Call, 4);
949 c.write_u16(1, 4);
950 c.code.push(1);
951 c.source_lines.push(4);
952 c.write_op(Opcode::Return, 5);
953
954 p.chunks.push(c.clone());
955
956 let first = c.disassemble(&p);
957 let second = c.disassemble(&p);
958 assert_eq!(first, second, "disassembler is non-deterministic");
959 // sanity: the output spans multiple instructions.
960 assert!(first.lines().count() >= 8);
961 }
962
963 // Test 17: disassemble produces exactly one line per opcode in the
964 // chunk -- no skipping, no extra lines.
965 #[test]
966 fn disassemble_produces_one_line_per_opcode_for_every_operand_width() {
967 let mut p = Program::new();
968 p.fn_names.push("main".to_string());
969 p.fn_names.push("f".to_string());
970 p.enum_variant_names
971 .push(("E".to_string(), "A".to_string()));
972
973 let mut c = Chunk::new();
974 // zero-operand
975 c.write_op(Opcode::Pop, 1);
976 // two-operand (u16)
977 c.write_op(Opcode::Const, 2);
978 c.write_u16(0, 2);
979 // three-operand (u16 + u8)
980 c.write_op(Opcode::Call, 3);
981 c.write_u16(1, 3);
982 c.code.push(0);
983 c.source_lines.push(3);
984 // four-operand (u16 + i16)
985 c.write_op(Opcode::MatchVariant, 4);
986 c.write_u16(0, 4);
987 c.write_i16(0, 4);
988
989 let _ = c.add_constant(ConstValue::I64(0));
990 p.chunks.push(c.clone());
991
992 let out = c.disassemble(&p);
993 // 4 instructions -> 4 lines (newline-terminated).
994 assert_eq!(out.matches('\n').count(), 4, "wrong line count in:\n{out}");
995 }
996
997 // Test 18: Program::optimize runs the peephole pass over every chunk --
998 // both chunks have their CONST+POP pairs removed in place.
999 #[test]
1000 fn program_optimize_runs_peephole_over_every_chunk() {
1001 let mut p = Program::new();
1002 p.fn_names.push("main".to_string());
1003 p.fn_names.push("helper".to_string());
1004
1005 let mut a = Chunk::new();
1006 let idx_a = a.add_constant(ConstValue::I64(0));
1007 a.write_op(Opcode::Const, 1);
1008 a.write_u16(idx_a, 1);
1009 a.write_op(Opcode::Pop, 1);
1010 a.write_op(Opcode::Return, 1);
1011
1012 let mut b = Chunk::new();
1013 let idx_b = b.add_constant(ConstValue::I64(1));
1014 b.write_op(Opcode::Const, 1);
1015 b.write_u16(idx_b, 1);
1016 b.write_op(Opcode::Pop, 1);
1017 b.write_op(Opcode::Return, 1);
1018
1019 p.chunks.push(a);
1020 p.chunks.push(b);
1021
1022 p.optimize();
1023
1024 // the CONST+POP pair is gone from each chunk; only RETURN remains.
1025 assert_eq!(p.chunks[0].code, vec![Opcode::Return as u8]);
1026 assert_eq!(p.chunks[1].code, vec![Opcode::Return as u8]);
1027 // the lockstep invariant survives the whole-program pass.
1028 assert_eq!(p.chunks[0].code.len(), p.chunks[0].source_lines.len());
1029 assert_eq!(p.chunks[1].code.len(), p.chunks[1].source_lines.len());
1030 }
1031
1032 // Test 18b: Program::optimize on an empty program is a no-op (no panic).
1033 #[test]
1034 fn program_optimize_on_an_empty_program_does_nothing() {
1035 let mut p = Program::new();
1036 p.optimize();
1037 assert!(p.chunks.is_empty());
1038 }
1039
1040 // Test 18c: Program::optimize is idempotent -- a second call changes
1041 // nothing because the first pass already collapsed every window.
1042 #[test]
1043 fn program_optimize_is_idempotent() {
1044 let mut p = Program::new();
1045 p.fn_names.push("main".to_string());
1046
1047 let mut a = Chunk::new();
1048 let idx = a.add_constant(ConstValue::I64(0));
1049 a.write_op(Opcode::Const, 1);
1050 a.write_u16(idx, 1);
1051 a.write_op(Opcode::Pop, 1);
1052 a.write_op(Opcode::Dup, 2);
1053 a.write_op(Opcode::Pop, 2);
1054 a.write_op(Opcode::Return, 3);
1055 p.chunks.push(a);
1056
1057 p.optimize();
1058 let after_first = p.chunks[0].code.clone();
1059 p.optimize();
1060 assert_eq!(
1061 p.chunks[0].code, after_first,
1062 "second optimize changed bytes"
1063 );
1064 }
1065
1066 // bonus: Program::disassemble header + body composition is determ.
1067 #[test]
1068 fn program_disassemble_emits_a_header_per_chunk_and_concatenates_bodies() {
1069 let mut p = Program::new();
1070 p.fn_names.push("main".to_string());
1071 p.fn_names.push("helper".to_string());
1072
1073 let mut a = chunk_with_const(ConstValue::I64(0));
1074 a.write_op(Opcode::Return, 1);
1075 let mut b = chunk_with_const(ConstValue::I64(1));
1076 b.write_op(Opcode::Return, 1);
1077
1078 p.chunks.push(a);
1079 p.chunks.push(b);
1080
1081 let out = p.disassemble();
1082 assert!(out.contains("== chunk: main (fn_id 0) =="));
1083 assert!(out.contains("== chunk: helper (fn_id 1) =="));
1084 // the helper's "RETURN" line follows the main's body.
1085 let main_idx = out.find("== chunk: main").unwrap();
1086 let helper_idx = out.find("== chunk: helper").unwrap();
1087 assert!(main_idx < helper_idx);
1088 // determinism: calling twice produces identical output.
1089 assert_eq!(p.disassemble(), p.disassemble());
1090 }
1091}