qala-compiler 0.1.1

Compiler and bytecode VM for the Qala programming language
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
//! the per-function stack-frame layout for the ARM64 backend.
//!
//! [`FrameLayout`] holds, for one function, the map from a parameter name to
//! its `[fp, N]` stack slot, an indexed list of slots for the body's `let`
//! bindings and `for`-loop slots, the count of scratch spill slots the
//! expression codegen needs, and the derived `alloc` / `dealloc` frame sizes.
//! [`FrameLayout::plan_frame`] builds it from a [`TypedFnDecl`] in one pass
//! before the prologue is emitted.
//!
//! the slot model is deliberately simple -- correctness over density. every
//! parameter, every `let`, and every scratch spill slot occupies a full 8-byte
//! slot (an `i64` and a `bool` both take 8 bytes; no packing).
//!
//! ## binding slots: indexed by walk order, not by name
//!
//! a `let` may shadow an outer `let` of the same name, so a name-keyed map
//! cannot hold every binding's slot -- two `let x` would collide. instead
//! [`plan_frame`] walks the body in a fixed source order and assigns binding
//! occurrence `0, 1, 2, ...` the next slot, recording them in a `Vec` indexed
//! by that occurrence number. `stmt.rs` walks the body in the identical order,
//! keeping its own occurrence counter, and reads
//! [`FrameLayout::binding_slot`] by index. the deterministic walk order is the
//! shared contract between the two files: a parameter slot is named, a `let`
//! or `for` slot is positional. NO SLOT REUSE -- every binding occurrence,
//! including a shadowing `let` and a `for`-loop variable, gets its own
//! distinct slot.
//!
//! ## call arguments: spilled to claimed scratch slots, never a shared run
//!
//! a `TypedExpr::Call` evaluates each argument into `x0` one at a time, then
//! loads `x0`-`x{n-1}` from somewhere stable for the `bl` -- because every
//! `compile_expr` lands its result in `x0`, evaluating arg 1 would clobber
//! arg 0. each evaluated argument is `str`-ed to a freshly CLAIMED scratch
//! slot -- the same [`claim_scratch`](FrameLayout::claim_scratch) /
//! [`release_scratch`](FrameLayout::release_scratch) stack the nested-binary
//! spill discipline uses -- and all of them are released only after the `bl`.
//! a nested call inside one argument therefore claims slots STRICTLY BEYOND
//! the outer call's already-claimed argument slots, so no two live argument
//! values ever alias the same slot. there is no dedicated, shared argument
//! region: an argument slot is just another scratch slot, and the scratch
//! reservation in [`plan_frame`] is sized to cover it (see [`max_spill_depth`]
//! and its [`TypedExpr::Call`] arm).
//!
//! ## frame geometry: positive offsets from `fp`
//!
//! the prologue is `stp fp, lr, [sp, alloc]!` then `mov fp, sp`, so `fp` ends
//! up at the *bottom* of the allocated frame. the saved `fp`/`lr` pair sits at
//! `[fp, 0]` and `[fp, 8]`; every local slot is therefore at a *positive*
//! offset of 16 or more -- above the saved pair, inside the frame. parameter
//! `i` is at `[fp, 16 + 8*i]`. this matches the CPSC 355 reference
//! `Week 8/example1_scores.asm`, which accesses its locals as
//! `[fp, score1_s]` with `score1_s = 16`. (a negative `[fp, -N]` offset would
//! address *below* `sp`, outside the frame -- a stack-corruption bug.) the
//! slot regions in `[fp, N]` order: saved pair, parameters, binding slots,
//! expression scratch slots (which also hold a call's spilled arguments).
//!
//! ## the load-bearing detail: 16-byte stack alignment
//!
//! AArch64 faults on a misaligned `sp` at any memory access or `bl`. the frame
//! size must keep `sp` 16-byte aligned. the formula is
//! `alloc = -(16 + locals) & -16`: `16` is the saved `fp`/`lr` pair, `locals`
//! is the total stack-slot bytes, and `& -16` rounds the magnitude up to a
//! 16-byte multiple (`-16` is `...11110000` in two's complement, so the mask
//! clears the low four bits). the backend never touches `sp` between the
//! prologue and the epilogue -- locals are `fp`-relative -- so alignment
//! correctness reduces entirely to this one formula. [`compute_alloc`] is the
//! single implementation; its unit test asserts the four hand-computed values.

use crate::typed_ast::{
    TypedBlock, TypedElseBranch, TypedExpr, TypedFnDecl, TypedInterpPart, TypedStmt,
};
use std::collections::{HashMap, HashSet};

/// the byte size of one stack slot. an `i64` and a `bool` both occupy a full
/// 8-byte slot -- the integer core does not pack.
const SLOT_SIZE: i64 = 8;

/// the saved `fp`/`lr` pair size: two 8-byte registers, always 16 bytes and
/// always 16-byte aligned on its own. it occupies `[fp, 0]` and `[fp, 8]`, so
/// it is also the byte offset of the first local slot -- locals start at
/// `[fp, 16]`, just above the saved pair.
const FP_LR_PAIR: i64 = 16;

/// compute the AAPCS64 frame-allocation offset for a function with `locals`
/// bytes of stack-slot space.
///
/// returns the (negative) `alloc` used in `stp fp, lr, [sp, alloc]!`. the
/// formula is `-(16 + locals) & -16` -- the `fp`/`lr` pair plus the locals,
/// the magnitude rounded up to the next 16-byte multiple. `dealloc` (the
/// epilogue's positive add-back) is simply `-alloc`.
///
/// signed `i64` arithmetic is required: the `& -16` mask relies on `-16`'s
/// two's-complement bit pattern.
fn compute_alloc(locals: i64) -> i64 {
    -(FP_LR_PAIR + locals) & -16
}

/// the stack-frame layout of one function: the parameter name-to-slot map,
/// the body's binding slots, the scratch spill-slot reservation, and the
/// derived `alloc` / `dealloc` sizes.
///
/// crate-private: the layout is an internal detail of the `arm64` module.
pub(crate) struct FrameLayout {
    /// map from a parameter name to its stack slot -- the positive byte offset
    /// `N` in `[fp, N]`. parameters are name-keyed because a parameter name is
    /// unique within a signature; body `let`/`for` slots are positional (see
    /// [`binding_slots`](FrameLayout::binding_slots)).
    slots: HashMap<String, i64>,
    /// the stack slots for the body's `let` bindings and `for`-loop slots, in
    /// the deterministic walk order [`plan_frame`] visits them. binding
    /// occurrence `k` -- the `k`-th `let`, `for`-variable, or `for`-end slot
    /// the walk reaches -- is at `binding_slots[k]`. `stmt.rs` re-walks in
    /// lockstep and indexes this by occurrence number, which survives a
    /// shadowing `let` that a name-keyed map could not.
    binding_slots: Vec<i64>,
    /// the number of scratch spill slots reserved for the expression codegen.
    /// this covers both the nested-binary-operator spill discipline and a
    /// call's spilled arguments -- a call argument is `str`-ed to a claimed
    /// scratch slot, so the reservation must account for it (see
    /// [`max_spill_depth`]). their offsets sit immediately past the named and
    /// binding slots.
    #[allow(dead_code)]
    scratch_count: i64,
    /// the byte offset of the first scratch spill slot: `[fp, 16]` past the
    /// saved pair, plus the named and binding slots, is where the scratch
    /// stack begins.
    scratch_base: i64,
    /// how many scratch slots are currently claimed. the expression codegen
    /// claims on entering a binary op and releases on leaving; this never
    /// exceeds `scratch_count`.
    scratch_in_use: i64,
    /// the function's `alloc` offset (negative) for the prologue `stp`.
    alloc: i64,
}

impl FrameLayout {
    /// build the frame layout for one function: assign every parameter a slot,
    /// assign every body `let` binding and `for`-loop slot a distinct slot,
    /// reserve scratch spill slots for the body's deepest expression nesting,
    /// and compute `alloc` / `dealloc`.
    ///
    /// the slot regions, in `[fp, N]` order: the `FP_LR_PAIR` saved-register
    /// pair occupies `[fp, 0]` and `[fp, 8]`; then the parameters at
    /// `[fp, 16 + 8*i]`; then the body's binding slots (every `let` and, per
    /// `for`, the loop variable and a dedicated `end`-bound slot) in walk
    /// order; then the expression scratch slots.
    ///
    /// the binding slots are assigned by [`count_binding_slots`], a recursive
    /// walk over `decl.body` that descends into nested `if`/`while`/`for`
    /// blocks. no slot is reused -- a shadowing `let` and every `for` variable
    /// get their own distinct slot. the scratch reservation is derived from a
    /// pre-walk: [`max_spill_depth`] returns the peak number of scratch slots
    /// simultaneously live -- a binary operator's spilled LHS values and a
    /// call's spilled arguments together.
    ///
    /// `fn_names` is the set of user-declared function names: a call to a
    /// `print` / `println` name that the user has shadowed with their own
    /// function is sized as an ordinary user call, not as a `printf` lowering,
    /// so the spill pre-walk needs that set to match the emitter's routing.
    pub(crate) fn plan_frame(decl: &TypedFnDecl, fn_names: &HashSet<String>) -> FrameLayout {
        let mut slots = HashMap::new();
        // parameter i lives at [fp, 16 + 8*i]: the first param at [fp, 16],
        // the second at [fp, 24], and so on -- just above the saved fp/lr pair.
        for (i, param) in decl.params.iter().enumerate() {
            let slot = FP_LR_PAIR + SLOT_SIZE * i as i64;
            slots.insert(param.name.clone(), slot);
        }
        let param_count = decl.params.len() as i64;
        // the body's binding slots begin immediately past the parameter slots:
        // a `let` and, per `for`, two slots (the loop variable plus a dedicated
        // `end`-bound slot the For arm evaluates `end` into once).
        let binding_base = FP_LR_PAIR + SLOT_SIZE * param_count;
        let binding_count = count_binding_slots(&decl.body);
        let binding_slots: Vec<i64> = (0..binding_count)
            .map(|k| binding_base + SLOT_SIZE * k)
            .collect();
        let named_slot_count = param_count + binding_count;
        // the expression scratch slots sit immediately past the binding slots.
        // they hold both a binary operator's spilled LHS and a call's spilled
        // arguments -- `max_spill_depth` accounts for both.
        let scratch_base = FP_LR_PAIR + SLOT_SIZE * named_slot_count;
        let scratch_count = max_spill_depth(&decl.body, fn_names);
        let locals = SLOT_SIZE * (named_slot_count + scratch_count);
        let alloc = compute_alloc(locals);
        FrameLayout {
            slots,
            binding_slots,
            scratch_count,
            scratch_base,
            scratch_in_use: 0,
            alloc,
        }
    }

    /// the stack slot of a named parameter, or `None` if the name is unknown.
    ///
    /// the returned value is the positive offset `N` for `[fp, N]`. an unknown
    /// name is a backend bug -- the caller turns a `None` into a `QalaError`
    /// rather than panicking. `let` and `for` bindings are NOT resolved here --
    /// they are positional; see [`binding_slot`](FrameLayout::binding_slot).
    pub(crate) fn slot_of(&self, name: &str) -> Option<i64> {
        self.slots.get(name).copied()
    }

    /// the stack slot of body binding occurrence `index` -- the `index`-th
    /// `let`, `for`-variable, or `for`-end slot in [`plan_frame`]'s walk order.
    ///
    /// `stmt.rs` walks the body in the same order [`plan_frame`] used, keeping
    /// its own occurrence counter, and calls this with the next index when it
    /// reaches a `let` or `for`. an out-of-range index is a backend bug -- a
    /// walk-order mismatch between `plan_frame` and `stmt.rs` -- and the caller
    /// turns the `None` into a `QalaError` rather than panicking.
    pub(crate) fn binding_slot(&self, index: usize) -> Option<i64> {
        self.binding_slots.get(index).copied()
    }

    /// claim the next scratch spill slot, returning its `[fp, N]` offset.
    ///
    /// the expression codegen calls this on entering a binary operator (to hold
    /// the evaluated LHS while the RHS subtree runs), and once per argument of
    /// a call (to hold each evaluated argument until the `bl`); the matching
    /// [`release_scratch`] returns the slot. claims stack: nested binary ops
    /// and a nested call each get distinct slots strictly beyond the slots
    /// already claimed, so no two simultaneously-live values ever alias. the
    /// reservation from [`plan_frame`] -- via [`max_spill_depth`], which counts
    /// a call's argument spills -- guarantees a slot is always available.
    pub(crate) fn claim_scratch(&mut self) -> i64 {
        // the slot offset: past the named slots, indexed by how many scratch
        // slots are already in use.
        let offset = self.scratch_base + SLOT_SIZE * self.scratch_in_use;
        self.scratch_in_use += 1;
        offset
    }

    /// release the most recently claimed scratch spill slot.
    ///
    /// the inverse of [`claim_scratch`]; the two must be balanced so the
    /// scratch stack returns to empty at the end of every expression.
    pub(crate) fn release_scratch(&mut self) {
        debug_assert!(
            self.scratch_in_use > 0,
            "release_scratch with no slot claimed"
        );
        self.scratch_in_use -= 1;
    }

    /// the prologue instruction text: save `fp`/`lr` and set the frame pointer.
    ///
    /// emits the pre-indexed `stp fp, lr, [sp, <alloc>]!` (allocate the frame
    /// and store the pair) followed by `mov fp, sp`. `alloc` is the computed
    /// numeric literal, not an m4 symbol, so the emitted text is self-contained.
    pub(crate) fn prologue(&self) -> [String; 2] {
        [
            format!("stp     fp, lr, [sp, {}]!", self.alloc),
            "mov     fp, sp".to_string(),
        ]
    }

    /// the epilogue instruction text: restore `fp`/`lr` and return.
    ///
    /// emits the post-indexed `ldp fp, lr, [sp], <dealloc>` (load the pair and
    /// free the frame) followed by `ret`. `dealloc` is `-alloc`, the positive
    /// frame magnitude.
    pub(crate) fn epilogue(&self) -> [String; 2] {
        [
            format!("ldp     fp, lr, [sp], {}", -self.alloc),
            "ret".to_string(),
        ]
    }

    /// the number of scratch spill slots this frame reserves for the
    /// expression codegen's nested-binary-operator spill discipline.
    ///
    /// plan 12-02 consults this when it extends `plan_frame` to also count
    /// `let`-binding slots -- the scratch reservation must survive that
    /// extension. tests assert it against known expression shapes.
    #[allow(dead_code)]
    pub(crate) fn scratch_count(&self) -> i64 {
        self.scratch_count
    }
}

/// the count of body binding slots a function needs: one slot per `let`
/// binding, two slots per `for` loop (the loop variable and a dedicated
/// `end`-bound slot the For arm evaluates `end` into once).
///
/// THE WALK ORDER is the contract between this function and `stmt.rs`. both
/// descend `TypedBlock.stmts` in source order, into `If.then_block` then the
/// `else` branch, into `While.body`, and into `For.body`. a `for` reserves its
/// variable slot first, then its `end` slot, then descends into the body --
/// `stmt.rs` claims the slots in that same order. as long as the two walks
/// match, occurrence `k` here is occurrence `k` there. the `for`-variable slot
/// is registered into the body's scope by `stmt.rs`, so an `Ident` reference
/// to the loop variable resolves to it.
fn count_binding_slots(block: &TypedBlock) -> i64 {
    let mut count = 0;
    for stmt in &block.stmts {
        count += stmt_binding_slots(stmt);
    }
    count
}

/// the binding-slot count contributed by one statement (see
/// [`count_binding_slots`] for the walk-order contract).
fn stmt_binding_slots(stmt: &TypedStmt) -> i64 {
    match stmt {
        // a `let` is one slot; its initializer holds no further bindings of
        // its own (an initializer is an expression, and expressions carry no
        // `let`/`for` -- a block expression's statements would, but the integer
        // core's `Block` expression is reached only as a trailing value, which
        // this walk already covers via the body it belongs to).
        TypedStmt::Let { .. } => 1,
        TypedStmt::If {
            then_block,
            else_branch,
            ..
        } => {
            let mut c = count_binding_slots(then_block);
            if let Some(branch) = else_branch {
                c += else_branch_binding_slots(branch);
            }
            c
        }
        TypedStmt::While { body, .. } => count_binding_slots(body),
        // a `for` is two slots -- the loop variable and the `end` bound --
        // counted BEFORE descending into the body, matching `stmt.rs`.
        TypedStmt::For { body, .. } => 2 + count_binding_slots(body),
        // these statements introduce no binding slot.
        TypedStmt::Return { .. }
        | TypedStmt::Break { .. }
        | TypedStmt::Continue { .. }
        | TypedStmt::Defer { .. }
        | TypedStmt::Expr { .. } => 0,
    }
}

/// the binding-slot count within an `else` branch (a block or an `else if`).
fn else_branch_binding_slots(branch: &TypedElseBranch) -> i64 {
    match branch {
        TypedElseBranch::Block(b) => count_binding_slots(b),
        TypedElseBranch::If(stmt) => stmt_binding_slots(stmt),
    }
}

/// the peak number of scratch slots simultaneously live while evaluating
/// `block`.
///
/// a scratch slot is claimed in two situations, and this pre-walk counts both
/// so [`plan_frame`] reserves enough for the deepest case:
///
/// - an arithmetic or comparison `Binary` node spills its evaluated LHS to a
///   scratch slot while it evaluates the RHS subtree. a nested operator spills
///   again, so a chain accumulates. short-circuit `&&` / `||` do NOT spill --
///   they branch on `x0` directly and reload no LHS.
/// - a `Call` spills each evaluated argument to a claimed scratch slot and
///   holds every one of them until the `bl`. its `n` arguments therefore
///   occupy `n` simultaneously-live slots, and evaluating any one argument can
///   itself need scratch slots on top.
///
/// the contract with the expression codegen: every slot the codegen claims is
/// counted here. a single shared argument run would NOT be safe (a nested call
/// would alias the outer call's saved arguments), which is why a call argument
/// is a claimed scratch slot like any other.
fn max_spill_depth(block: &TypedBlock, fn_names: &HashSet<String>) -> i64 {
    let mut max = 0;
    for stmt in &block.stmts {
        max = max.max(stmt_spill_depth(stmt, fn_names));
    }
    if let Some(value) = &block.value {
        max = max.max(expr_spill_depth(value, fn_names));
    }
    max
}

/// the peak simultaneously-live scratch-slot count within one statement.
///
/// every statement variant contributes the depth of its sub-expressions, so
/// the count covers a call or a binary operator wherever one appears -- a
/// `let` initializer, an `if` condition, a `for` range bound, a `return`
/// value, a bare expression statement.
fn stmt_spill_depth(stmt: &TypedStmt, fn_names: &HashSet<String>) -> i64 {
    match stmt {
        TypedStmt::Expr { expr, .. } => expr_spill_depth(expr, fn_names),
        TypedStmt::Return { value, .. } => value
            .as_ref()
            .map(|v| expr_spill_depth(v, fn_names))
            .unwrap_or(0),
        TypedStmt::Let { init, .. } => expr_spill_depth(init, fn_names),
        TypedStmt::If {
            cond,
            then_block,
            else_branch,
            ..
        } => {
            let mut d = expr_spill_depth(cond, fn_names).max(max_spill_depth(then_block, fn_names));
            if let Some(branch) = else_branch {
                d = d.max(else_branch_spill_depth(branch, fn_names));
            }
            d
        }
        TypedStmt::While { cond, body, .. } => {
            expr_spill_depth(cond, fn_names).max(max_spill_depth(body, fn_names))
        }
        TypedStmt::For { iter, body, .. } => {
            expr_spill_depth(iter, fn_names).max(max_spill_depth(body, fn_names))
        }
        TypedStmt::Defer { expr, .. } => expr_spill_depth(expr, fn_names),
        TypedStmt::Break { .. } | TypedStmt::Continue { .. } => 0,
    }
}

/// the maximum spill count within an `else` branch (a block or an `else if`).
fn else_branch_spill_depth(branch: &TypedElseBranch, fn_names: &HashSet<String>) -> i64 {
    match branch {
        TypedElseBranch::Block(b) => max_spill_depth(b, fn_names),
        TypedElseBranch::If(stmt) => stmt_spill_depth(stmt, fn_names),
    }
}

/// the peak number of scratch spill slots simultaneously live while
/// evaluating `expr`.
///
/// an arithmetic or comparison `Binary` node claims one scratch slot and holds
/// it for the *entire* evaluation of both operands -- the codegen claims before
/// the LHS runs and releases only after the RHS reload -- so its peak live-slot
/// count is `1 + max(spill_depth(lhs), spill_depth(rhs))`. a short-circuit
/// `&&` / `||` claims no slot of its own (it branches on `x0` and reloads no
/// LHS), so its peak is `max(spill_depth(lhs), spill_depth(rhs))`.
///
/// a `Call` with `n` arguments spills each evaluated argument to a claimed
/// scratch slot and holds all `n` until the `bl` -- so `n` slots are live at
/// once -- and evaluating any single argument can claim further slots on top.
/// the codegen evaluates an argument fully before claiming its slot, so while
/// argument `k` runs at most `k` sibling slots are already held; the peak is
/// therefore bounded by `n + max(depth of each argument subtree)`. that bound
/// (the callee subtree is included for completeness, though a supported callee
/// is always a bare identifier of depth 0) is what this arm returns, matching
/// the review's `args.len() + max(arg subtree depth)` model. non-call,
/// non-binary nodes recurse into their children with no added slot.
fn expr_spill_depth(expr: &TypedExpr, fn_names: &HashSet<String>) -> i64 {
    use crate::ast::BinOp;
    match expr {
        TypedExpr::Binary { op, lhs, rhs, .. } => {
            let operands = expr_spill_depth(lhs, fn_names).max(expr_spill_depth(rhs, fn_names));
            match op {
                // && and || branch on x0 -- no LHS reload, no scratch slot.
                BinOp::And | BinOp::Or => operands,
                // arithmetic and comparison hold one slot across BOTH operands.
                _ => 1 + operands,
            }
        }
        // a `print` / `println` call lowers to a `printf` call, not a normal
        // `bl`: its scratch need is one slot per i64 interpolation hole, not
        // one per argument -- the single `str` argument is itself walked into
        // the holes. `printf_call_spill_depth` computes that; a non-printf
        // call falls through to the general arity-plus-depth rule below.
        TypedExpr::Call { callee, args, .. } if is_print_callee(callee, fn_names) => {
            printf_call_spill_depth(args, fn_names)
        }
        // a call holds one scratch slot per argument simultaneously (the
        // spilled argument values, live until the `bl`), plus the deepest
        // single argument's own scratch need on top. the callee subtree is
        // folded into the same max -- harmless, since a supported callee is a
        // bare identifier. a single shared argument run is NOT used: it would
        // let a nested call clobber the outer call's saved arguments.
        TypedExpr::Call { callee, args, .. } => {
            let mut deepest = expr_spill_depth(callee, fn_names);
            for arg in args {
                deepest = deepest.max(expr_spill_depth(arg, fn_names));
            }
            args.len() as i64 + deepest
        }
        TypedExpr::Unary { operand, .. } => expr_spill_depth(operand, fn_names),
        TypedExpr::Paren { inner, .. } => expr_spill_depth(inner, fn_names),
        TypedExpr::Block { block, .. } => max_spill_depth(block, fn_names),
        TypedExpr::Range { start, end, .. } => {
            let s = start
                .as_deref()
                .map(|e| expr_spill_depth(e, fn_names))
                .unwrap_or(0);
            let e = end
                .as_deref()
                .map(|e| expr_spill_depth(e, fn_names))
                .unwrap_or(0);
            s.max(e)
        }
        // a leaf or an unsupported construct contributes no spill.
        _ => 0,
    }
}

/// whether a call's callee is the `print` or `println` output built-in.
///
/// the `print` / `println` lowering ([`super::print`]) spills one scratch
/// slot per `i64` interpolation hole rather than one per call argument, so the
/// frame planner counts a printf call differently from a user call. a callee
/// is a printf callee when it is a bare identifier named `print` or `println`
/// AND that name is not a user-declared function -- a program may declare its
/// own `fn println`, which shadows the built-in and is sized as an ordinary
/// user call. this matches the routing in `expr.rs::compile_call`: the two
/// must agree, or the planned frame would not fit the emitted body.
fn is_print_callee(callee: &TypedExpr, fn_names: &HashSet<String>) -> bool {
    matches!(
        callee,
        TypedExpr::Ident { name, .. }
            if (name == "print" || name == "println") && !fn_names.contains(name)
    )
}

/// the peak scratch-slot count a `print` / `println` call needs.
///
/// the call lowers to a `printf`: each `i64` interpolation hole is evaluated
/// into `x0` and spilled to a claimed scratch slot, and every spilled hole is
/// held until `bl printf`. so `n` holes occupy `n` simultaneously-live slots,
/// and evaluating any single hole can claim further slots on top -- the same
/// `args.len() + max(arg depth)` bound the general `Call` arm uses, but
/// counted over the interpolation's holes instead of the call's arguments. a
/// string-literal argument has no holes and needs no scratch; a non-literal,
/// non-interpolation argument is rejected by the emitter, so a `0` here is
/// safe -- the frame for a program the backend rejects is never emitted.
fn printf_call_spill_depth(args: &[TypedExpr], fn_names: &HashSet<String>) -> i64 {
    // a well-typed print / println has exactly one str argument; walk it for
    // the interpolation holes. anything else contributes no scratch.
    let Some(TypedExpr::Interpolation { parts, .. }) = args.first() else {
        return 0;
    };
    let mut hole_count = 0;
    let mut deepest = 0;
    for part in parts {
        if let TypedInterpPart::Expr(expr) = part {
            hole_count += 1;
            deepest = deepest.max(expr_spill_depth(expr, fn_names));
        }
    }
    hole_count + deepest
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::ast::BinOp;
    use crate::effects::EffectSet;
    use crate::span::Span;
    use crate::types::QalaType;

    fn span(n: usize) -> Span {
        Span::new(n, n + 1)
    }

    fn int(value: i64) -> TypedExpr {
        TypedExpr::Int {
            value,
            ty: QalaType::I64,
            span: span(0),
        }
    }

    /// build a TypedFnDecl with the given parameter count and body.
    fn fn_decl(param_count: usize, body: crate::typed_ast::TypedBlock) -> TypedFnDecl {
        let params = (0..param_count)
            .map(|i| crate::typed_ast::TypedParam {
                is_self: false,
                name: format!("p{i}"),
                ty: QalaType::I64,
                default: None,
                span: span(i),
            })
            .collect();
        TypedFnDecl {
            type_name: None,
            name: "f".to_string(),
            params,
            ret_ty: QalaType::I64,
            effect: EffectSet::pure(),
            body,
            span: span(0),
        }
    }

    /// an empty user-function-name set: the synthetic-decl tests make no
    /// calls, so no `print` / `println` shadowing can apply.
    fn no_fns() -> HashSet<String> {
        HashSet::new()
    }

    /// the set of every user-declared function name in `typed` -- what
    /// `Arm64Backend::scan_fn_names` builds, reproduced for the source-based
    /// frame tests so `plan_frame` sees the same shadowing context the
    /// compiler does.
    fn fn_names_of(typed: &crate::typed_ast::TypedAst) -> HashSet<String> {
        typed
            .iter()
            .filter_map(|item| match item {
                crate::typed_ast::TypedItem::Fn(d) => Some(d.name.clone()),
                _ => None,
            })
            .collect()
    }

    fn empty_body() -> crate::typed_ast::TypedBlock {
        crate::typed_ast::TypedBlock {
            stmts: vec![],
            value: None,
            ty: QalaType::Void,
            span: span(0),
        }
    }

    #[test]
    fn alloc_formula_matches_the_hand_computed_values() {
        // THE load-bearing test: alloc = -(16 + locals) & -16 for the four
        // hand-computed cases. these concrete values are independent of any
        // snapshot fixture -- a wrong formula fails here regardless.
        assert_eq!(compute_alloc(0), -16, "locals=0 must round to -16");
        assert_eq!(
            compute_alloc(8),
            -32,
            "locals=8 (24 bytes) must round up to -32"
        );
        assert_eq!(
            compute_alloc(16),
            -32,
            "locals=16 (32 bytes) must round to -32"
        );
        assert_eq!(
            compute_alloc(24),
            -48,
            "locals=24 (40 bytes) must round up to -48"
        );
    }

    #[test]
    fn alloc_is_always_a_negative_multiple_of_sixteen() {
        // every frame size, for a range of locals, keeps sp 16-byte aligned.
        for locals in (0..=512).step_by(8) {
            let a = compute_alloc(locals);
            assert!(a <= -16, "alloc must reserve at least the fp/lr pair");
            assert_eq!(
                a % 16,
                0,
                "alloc must be a multiple of 16 for locals={locals}"
            );
        }
    }

    #[test]
    fn a_leaf_function_with_no_params_has_a_minus_sixteen_frame() {
        // a 0-parameter function with an empty body: no named slots, no
        // scratch, locals = 0 -> alloc = -16, dealloc = 16. the concrete
        // prologue/epilogue text is asserted verbatim.
        let layout = FrameLayout::plan_frame(&fn_decl(0, empty_body()), &no_fns());
        assert_eq!(layout.alloc, -16);
        assert_eq!(layout.prologue()[0], "stp     fp, lr, [sp, -16]!");
        assert_eq!(layout.prologue()[1], "mov     fp, sp");
        assert_eq!(layout.epilogue()[0], "ldp     fp, lr, [sp], 16");
        assert_eq!(layout.epilogue()[1], "ret");
    }

    #[test]
    fn three_params_no_scratch_gives_a_minus_thirty_two_frame() {
        // a 3-parameter function with a body that needs no scratch (the
        // trailing value is a bare ident): 3 named slots = 24 bytes locals,
        // alloc = -(16+24) & -16 = -48. concrete value, independent of snapshots.
        let body = crate::typed_ast::TypedBlock {
            stmts: vec![],
            value: Some(Box::new(TypedExpr::Ident {
                name: "p0".to_string(),
                ty: QalaType::I64,
                span: span(0),
            })),
            ty: QalaType::I64,
            span: span(0),
        };
        let layout = FrameLayout::plan_frame(&fn_decl(3, body), &no_fns());
        assert_eq!(layout.scratch_count(), 0, "a bare ident needs no scratch");
        assert_eq!(layout.alloc, -48, "3 params (24 bytes) -> alloc -48");
    }

    #[test]
    fn parameter_slots_start_at_sixteen_above_the_saved_pair() {
        // parameter 0 -> [fp, 16], parameter 1 -> [fp, 24], parameter 2 ->
        // [fp, 32]: positive offsets just past the saved fp/lr pair.
        let layout = FrameLayout::plan_frame(&fn_decl(3, empty_body()), &no_fns());
        assert_eq!(layout.slot_of("p0"), Some(16));
        assert_eq!(layout.slot_of("p1"), Some(24));
        assert_eq!(layout.slot_of("p2"), Some(32));
        assert_eq!(layout.slot_of("missing"), None);
    }

    #[test]
    fn max_spill_depth_is_zero_for_a_leaf_expression() {
        // a bare literal trailing value: no binary op, no spill.
        let body = crate::typed_ast::TypedBlock {
            stmts: vec![],
            value: Some(Box::new(int(7))),
            ty: QalaType::I64,
            span: span(0),
        };
        assert_eq!(max_spill_depth(&body, &no_fns()), 0);
    }

    #[test]
    fn max_spill_depth_counts_a_single_binary_op_as_one() {
        // `1 + 2`: the LHS is spilled once while the RHS (a leaf) runs.
        let add = TypedExpr::Binary {
            op: BinOp::Add,
            lhs: Box::new(int(1)),
            rhs: Box::new(int(2)),
            ty: QalaType::I64,
            span: span(0),
        };
        let body = crate::typed_ast::TypedBlock {
            stmts: vec![],
            value: Some(Box::new(add)),
            ty: QalaType::I64,
            span: span(0),
        };
        assert_eq!(max_spill_depth(&body, &no_fns()), 1);
    }

    #[test]
    fn max_spill_depth_grows_with_right_leaning_nesting() {
        // `1 + (2 + (3 + 4))`: the outer op spills, its RHS is itself a binary
        // op that spills, and so on -- three simultaneously-live spills.
        let inner = TypedExpr::Binary {
            op: BinOp::Add,
            lhs: Box::new(int(3)),
            rhs: Box::new(int(4)),
            ty: QalaType::I64,
            span: span(0),
        };
        let mid = TypedExpr::Binary {
            op: BinOp::Add,
            lhs: Box::new(int(2)),
            rhs: Box::new(inner),
            ty: QalaType::I64,
            span: span(0),
        };
        let outer = TypedExpr::Binary {
            op: BinOp::Add,
            lhs: Box::new(int(1)),
            rhs: Box::new(mid),
            ty: QalaType::I64,
            span: span(0),
        };
        let body = crate::typed_ast::TypedBlock {
            stmts: vec![],
            value: Some(Box::new(outer)),
            ty: QalaType::I64,
            span: span(0),
        };
        assert_eq!(max_spill_depth(&body, &no_fns()), 3);
    }

    #[test]
    fn left_leaning_nesting_accumulates_spills() {
        // `((1 + 2) + 3) + 4`: each operator claims its scratch slot BEFORE
        // evaluating its LHS and holds it across the whole LHS subtree, so a
        // left-leaning chain accumulates -- three operators deep means three
        // simultaneously-live slots when the innermost `1 + 2` runs.
        let a = TypedExpr::Binary {
            op: BinOp::Add,
            lhs: Box::new(int(1)),
            rhs: Box::new(int(2)),
            ty: QalaType::I64,
            span: span(0),
        };
        let b = TypedExpr::Binary {
            op: BinOp::Add,
            lhs: Box::new(a),
            rhs: Box::new(int(3)),
            ty: QalaType::I64,
            span: span(0),
        };
        let c = TypedExpr::Binary {
            op: BinOp::Add,
            lhs: Box::new(b),
            rhs: Box::new(int(4)),
            ty: QalaType::I64,
            span: span(0),
        };
        let body = crate::typed_ast::TypedBlock {
            stmts: vec![],
            value: Some(Box::new(c)),
            ty: QalaType::I64,
            span: span(0),
        };
        assert_eq!(max_spill_depth(&body, &no_fns()), 3);
    }

    #[test]
    fn claim_and_release_scratch_balance() {
        // claim hands out distinct slots past the named slots; release unwinds.
        let mut layout = FrameLayout::plan_frame(&fn_decl(2, empty_body()), &no_fns());
        // 2 params -> scratch_base = 16 + 8*2 = 32; the scratch slots run
        // [fp, 32], [fp, 40], ...
        let first = layout.claim_scratch();
        let second = layout.claim_scratch();
        assert_eq!(first, 32);
        assert_eq!(second, 40);
        assert_ne!(first, second, "nested claims must be distinct slots");
        layout.release_scratch();
        layout.release_scratch();
        // after a balanced unwind a fresh claim reuses the first slot.
        assert_eq!(layout.claim_scratch(), 32);
    }

    #[test]
    fn scratch_slots_are_counted_into_the_frame() {
        // a body whose trailing value is `1 + 2` reserves one scratch slot;
        // with 1 param that is 2 slots = 16 bytes -> alloc = -32.
        let add = TypedExpr::Binary {
            op: BinOp::Add,
            lhs: Box::new(int(1)),
            rhs: Box::new(int(2)),
            ty: QalaType::I64,
            span: span(0),
        };
        let body = crate::typed_ast::TypedBlock {
            stmts: vec![],
            value: Some(Box::new(add)),
            ty: QalaType::I64,
            span: span(0),
        };
        let layout = FrameLayout::plan_frame(&fn_decl(1, body), &no_fns());
        assert_eq!(layout.scratch_count(), 1);
        // 1 param + 1 scratch = 2 slots = 16 bytes locals -> -(16+16)&-16 = -32.
        assert_eq!(layout.alloc, -32);
    }

    /// lex, parse, and typecheck `src`, returning the first function's decl.
    fn fn_from_source(src: &str) -> TypedFnDecl {
        let tokens = crate::lexer::Lexer::tokenize(src).expect("lex failed");
        let ast = crate::parser::Parser::parse(&tokens).expect("parse failed");
        let (typed, terrors, _) = crate::typechecker::check_program(&ast, src);
        assert!(terrors.is_empty(), "typecheck errors: {terrors:?}");
        typed
            .into_iter()
            .find_map(|item| match item {
                crate::typed_ast::TypedItem::Fn(d) => Some(d),
                _ => None,
            })
            .expect("no function in source")
    }

    #[test]
    fn the_arithmetic_snapshot_function_has_a_minus_sixty_four_frame() {
        // the exact program in the arithmetic snapshot test. its trailing value
        // `(a + b) * (a - b) / 2 % 7` is four operators deep on the left spine
        // -- %, /, *, + -- each holding a scratch slot across its LHS subtree,
        // so the peak live-slot count is 4. with 2 parameters that is 6 slots =
        // 48 bytes of locals -> alloc = -(16 + 48) & -16 = -64. this concrete
        // value is asserted independently of the snapshot fixture: a wrong
        // spill-depth count fails here regardless of what the fixture holds.
        let decl = fn_from_source("fn arith(a: i64, b: i64) -> i64 { (a + b) * (a - b) / 2 % 7 }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(
            layout.scratch_count(),
            4,
            "the left spine is 4 operators deep"
        );
        assert_eq!(
            layout.alloc, -64,
            "2 params + 4 scratch = 48 bytes -> alloc -64"
        );
        assert_eq!(layout.prologue()[0], "stp     fp, lr, [sp, -64]!");
        assert_eq!(layout.epilogue()[0], "ldp     fp, lr, [sp], 64");
    }

    #[test]
    fn the_comparison_snapshot_function_has_a_minus_forty_eight_frame() {
        // the comparison snapshot's `cmp` function: `a == b` is one operator,
        // one scratch slot. 2 params + 1 scratch = 3 slots = 24 bytes -> alloc
        // = -(16 + 24) & -16 = -48. a concrete value, not an inequality.
        let decl = fn_from_source("fn cmp(a: i64, b: i64) -> bool { a == b }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(
            layout.scratch_count(),
            1,
            "one comparison -> one scratch slot"
        );
        assert_eq!(
            layout.alloc, -48,
            "2 params + 1 scratch = 24 bytes -> alloc -48"
        );
    }

    #[test]
    fn the_boolean_snapshot_function_has_a_minus_thirty_two_frame() {
        // the boolean snapshot's `andor` function: `a && b || !a`. the `&&` and
        // `||` short-circuit -- they claim no scratch slot -- and `!a` is unary.
        // so the function reserves zero scratch slots. 2 params + 0 scratch = 2
        // slots = 16 bytes -> alloc = -(16 + 16) & -16 = -32.
        let decl = fn_from_source("fn andor(a: bool, b: bool) -> bool { a && b || !a }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(
            layout.scratch_count(),
            0,
            "short-circuit ops claim no scratch"
        );
        assert_eq!(
            layout.alloc, -32,
            "2 params + 0 scratch = 16 bytes -> alloc -32"
        );
    }

    #[test]
    fn a_function_with_two_lets_assigns_them_concrete_distinct_slots() {
        // `fn f() { let x = 1 let y = 2 }`: no params, two `let` bindings.
        // binding occurrence 0 (`x`) is at [fp, 16], occurrence 1 (`y`) at
        // [fp, 24] -- distinct, consecutive, just above the saved pair.
        let decl = fn_from_source("fn f() { let x = 1\nlet y = 2 }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(layout.binding_slot(0), Some(16), "first let -> [fp, 16]");
        assert_eq!(layout.binding_slot(1), Some(24), "second let -> [fp, 24]");
        assert_eq!(layout.binding_slot(2), None, "only two bindings exist");
        // 2 binding slots = 16 bytes locals, no params, no scratch ->
        // alloc = -(16 + 16) & -16 = -32. a concrete value, not an inequality.
        assert_eq!(layout.alloc, -32, "2 lets (16 bytes) -> alloc -32");
    }

    #[test]
    fn let_bindings_are_counted_after_the_parameters() {
        // `fn f(a: i64) { let x = 1 }`: one parameter then one `let`. the
        // parameter takes [fp, 16]; the `let` binding slot starts past it at
        // [fp, 24]. 1 param + 1 let = 2 slots = 16 bytes -> alloc = -32.
        let decl = fn_from_source("fn f(a: i64) { let x = 1 }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(layout.slot_of("a"), Some(16), "param a -> [fp, 16]");
        assert_eq!(layout.binding_slot(0), Some(24), "the let -> [fp, 24]");
        assert_eq!(layout.alloc, -32, "1 param + 1 let = 16 bytes -> alloc -32");
    }

    #[test]
    fn a_shadowing_let_gets_its_own_distinct_slot() {
        // two `let x` in the same body -- the inner shadows the outer. a
        // name-keyed map would collide; the positional binding list gives each
        // its own slot. occurrence 0 -> [fp, 16], occurrence 1 -> [fp, 24].
        let decl = fn_from_source("fn f() -> i64 { let x = 1\nlet x = 2\nx }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(layout.binding_slot(0), Some(16), "outer x -> [fp, 16]");
        assert_eq!(layout.binding_slot(1), Some(24), "shadowing x -> [fp, 24]");
        assert_ne!(
            layout.binding_slot(0),
            layout.binding_slot(1),
            "a shadowing let must not reuse the outer slot"
        );
    }

    #[test]
    fn lets_nested_inside_an_if_are_counted_into_the_frame() {
        // a `let` inside an `if` block still needs a distinct slot. the body
        // has one outer `let` then an `if` with a `let` in each arm: three
        // `let`s total. binding occurrences 0/1/2 at [fp, 16]/[fp, 24]/[fp, 32].
        let decl =
            fn_from_source("fn f(c: bool) { let a = 1\nif c { let b = 2 } else { let d = 3 } }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(
            layout.binding_slot(0),
            Some(24),
            "outer let a, past param c"
        );
        assert_eq!(layout.binding_slot(1), Some(32), "let b in the then-block");
        assert_eq!(layout.binding_slot(2), Some(40), "let d in the else-block");
        // 1 param + 3 lets = 4 slots = 32 bytes -> alloc = -(16 + 32) & -16 = -48.
        assert_eq!(layout.alloc, -48, "1 param + 3 nested lets -> alloc -48");
    }

    #[test]
    fn a_for_loop_reserves_two_slots_a_variable_and_an_end_bound() {
        // `fn f() { for i in 0..10 { } }`: a `for` reserves two binding slots
        // -- the loop variable `i` and a dedicated `end`-bound slot. they are
        // occurrences 0 and 1 at [fp, 16] and [fp, 24].
        let decl = fn_from_source("fn f() { for i in 0..10 { } }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(layout.binding_slot(0), Some(16), "for-variable slot");
        assert_eq!(layout.binding_slot(1), Some(24), "for-end-bound slot");
        assert_eq!(
            layout.binding_slot(2),
            None,
            "a bare for is exactly two slots"
        );
        // 2 for slots = 16 bytes, no params, no scratch -> alloc = -32.
        assert_eq!(layout.alloc, -32, "one for loop (16 bytes) -> alloc -32");
    }

    #[test]
    fn a_let_inside_a_for_body_is_counted_after_the_for_slots() {
        // `for` slots come first (var, end), then the body's `let`. the body
        // is `for i in 0..3 { let s = i }`: occurrences 0 (`i`), 1 (`end`),
        // 2 (`s`) at [fp, 16], [fp, 24], [fp, 32].
        let decl = fn_from_source("fn f() { for i in 0..3 { let s = i } }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(layout.binding_slot(0), Some(16), "for-variable i");
        assert_eq!(layout.binding_slot(1), Some(24), "for-end bound");
        assert_eq!(layout.binding_slot(2), Some(32), "the let inside the body");
        // 3 binding slots = 24 bytes -> alloc = -(16 + 24) & -16 = -48.
        assert_eq!(layout.alloc, -48, "for (2 slots) + nested let -> alloc -48");
    }

    #[test]
    fn a_function_with_no_calls_reserves_no_scratch() {
        // `fn f(a: i64) -> i64 { a }` makes no call and has no binary op -- its
        // trailing value is a bare ident, so the scratch reservation is zero
        // and the frame holds only the one parameter slot.
        let decl = fn_from_source("fn f(a: i64) -> i64 { a }");
        let layout = FrameLayout::plan_frame(&decl, &no_fns());
        assert_eq!(layout.scratch_count(), 0, "a bare ident needs no scratch");
        // 1 param + 0 scratch = 8 bytes -> alloc = -(16 + 8) & -16 = -32.
        assert_eq!(layout.alloc, -32, "1 param, no call -> alloc -32");
    }

    #[test]
    fn a_two_param_one_let_function_with_a_three_arg_call_has_concrete_slots() {
        // a function with 2 params, 1 `let`, and a 3-argument call. named
        // slots: 2 params at [fp, 16]/[fp, 24], the `let` at [fp, 32]. the
        // call spills its three arguments to three claimed scratch slots,
        // immediately past the named slots -- [fp, 40], [fp, 48], [fp, 56].
        // each call argument is a bare ident or literal (depth 0), so the
        // call's scratch need is exactly args.len() = 3. locals =
        // 8 * (3 named + 3 scratch) = 48 -> alloc = -(16 + 48) & -16 = -64.
        let src = "fn add3(a: i64, b: i64, c: i64) -> i64 { a }\n\
                   fn caller(x: i64, y: i64) -> i64 { let r = add3(x, y, 1)\nr }";
        let tokens = crate::lexer::Lexer::tokenize(src).expect("lex failed");
        let ast = crate::parser::Parser::parse(&tokens).expect("parse failed");
        let (typed, terrors, _) = crate::typechecker::check_program(&ast, src);
        assert!(terrors.is_empty(), "typecheck errors: {terrors:?}");
        let caller = typed
            .iter()
            .find_map(|item| match item {
                crate::typed_ast::TypedItem::Fn(d) if d.name == "caller" => Some(d.clone()),
                _ => None,
            })
            .expect("no caller function");
        let layout = FrameLayout::plan_frame(&caller, &fn_names_of(&typed));
        // 2 params + 1 let = 3 named slots at [fp, 16], [fp, 24], [fp, 32].
        assert_eq!(layout.slot_of("x"), Some(16), "param x -> [fp, 16]");
        assert_eq!(layout.slot_of("y"), Some(24), "param y -> [fp, 24]");
        assert_eq!(layout.binding_slot(0), Some(32), "let r -> [fp, 32]");
        // the 3-arg call needs three scratch slots, past the named slots.
        assert_eq!(
            layout.scratch_count(),
            3,
            "a 3-arg call -> three scratch slots"
        );
        // 3 named + 3 scratch = 48 bytes -> alloc = -64.
        assert_eq!(
            layout.alloc, -64,
            "2 params + 1 let + 3-arg call -> alloc -64"
        );
        assert_eq!(layout.prologue()[0], "stp     fp, lr, [sp, -64]!");
        assert_eq!(layout.epilogue()[0], "ldp     fp, lr, [sp], 64");
    }

    #[test]
    fn sibling_calls_in_separate_statements_size_to_the_widest_not_the_sum() {
        // a function with a 1-arg call and a 2-arg call in SEPARATE statements
        // reserves 2 scratch slots -- the widest single call, not 1 + 2. each
        // call claims its argument slots and releases them at its `bl`, so the
        // two calls' slots never overlap and the peak is the larger call.
        let src = "fn one(a: i64) -> i64 { a }\n\
                   fn two(a: i64, b: i64) -> i64 { a }\n\
                   fn f() { let p = one(1)\nlet q = two(2, 3) }";
        let tokens = crate::lexer::Lexer::tokenize(src).expect("lex failed");
        let ast = crate::parser::Parser::parse(&tokens).expect("parse failed");
        let (typed, terrors, _) = crate::typechecker::check_program(&ast, src);
        assert!(terrors.is_empty(), "typecheck errors: {terrors:?}");
        let f = typed
            .iter()
            .find_map(|item| match item {
                crate::typed_ast::TypedItem::Fn(d) if d.name == "f" => Some(d.clone()),
                _ => None,
            })
            .expect("no f function");
        let layout = FrameLayout::plan_frame(&f, &fn_names_of(&typed));
        // the 2-arg call is the widest -> two scratch slots, past the two lets.
        assert_eq!(layout.scratch_count(), 2, "the widest call passes two args");
        // 2 lets + 2 scratch = 32 bytes -> alloc = -(16 + 32) & -16 = -48.
        assert_eq!(layout.alloc, -48, "two lets + a 2-arg call -> alloc -48");
    }

    #[test]
    fn a_nested_call_argument_adds_the_inner_calls_depth_to_the_outer_arity() {
        // `outer(inner(1, 2, 3))`: the outer call passes 1 argument, and that
        // argument is itself a 3-arg call. the outer call's single argument
        // slot is live WHILE `inner(1, 2, 3)` evaluates -- and inner needs
        // three slots of its own -- so the peak is 1 + 3 = 4, NOT 3. the old
        // max-args planner under-counted this at 3; the spill-depth model must
        // see the inner depth stacked on the outer arity.
        let src = "fn inner(a: i64, b: i64, c: i64) -> i64 { a }\n\
                   fn outer(x: i64) -> i64 { x }\n\
                   fn f() { let r = outer(inner(1, 2, 3)) }";
        let tokens = crate::lexer::Lexer::tokenize(src).expect("lex failed");
        let ast = crate::parser::Parser::parse(&tokens).expect("parse failed");
        let (typed, terrors, _) = crate::typechecker::check_program(&ast, src);
        assert!(terrors.is_empty(), "typecheck errors: {terrors:?}");
        let f = typed
            .iter()
            .find_map(|item| match item {
                crate::typed_ast::TypedItem::Fn(d) if d.name == "f" => Some(d.clone()),
                _ => None,
            })
            .expect("no f function");
        let layout = FrameLayout::plan_frame(&f, &fn_names_of(&typed));
        assert_eq!(
            layout.scratch_count(),
            4,
            "outer arity (1) + inner call depth (3) = 4 scratch slots"
        );
        // 1 let + 4 scratch = 40 bytes -> alloc = -(16 + 40) & -16 = -64.
        assert_eq!(
            layout.alloc, -64,
            "1 let + a nested 3-arg call -> alloc -64"
        );
    }
}