relon-codegen-llvm 0.1.0-rc2

LLVM-backed AOT evaluator for Relon (Phase A bootstrap)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
//! IR -> LLVM IR lowering.
//!
//! Phase B widens the emitter past the Phase A bootstrap envelope:
//!
//! - Two entry shapes:
//!   - **Legacy-i64**: `(I64...) -> I64` — driven by
//!     [`LlvmAotEvaluator::from_ir_direct`]. Mirrors the cranelift
//!     crate's same-named envelope; used by the Phase A bootstrap
//!     tests and the side-by-side `from_ir_direct` benchmarks.
//!   - **Buffer-protocol**: `(*state, i32 in_ptr, i32 in_len,
//!     i32 out_ptr, i32 out_cap, i64 caps) -> i32` — driven by
//!     [`LlvmAotEvaluator::from_source`]. Matches what
//!     `lower_workspace_single` emits for every user source.
//!
//! - Op set widened to the W1 / W2 production-source surface:
//!   `LocalGet`, `ConstI64` / `ConstI32` / `ConstBool`, `LetGet` /
//!   `LetSet`, `LoadField` / `StoreField` (scalar slots: I32 / I64 /
//!   F64 / Bool / Unit), `Add` / `Sub` / `Mul` / `Div` / `Mod` /
//!   `BitAnd` (`I32` and `I64`), comparison ops (`Eq` / `Ne` /
//!   `Lt` / `Le` / `Gt` / `Ge` — `I32` / `I64` / `Bool` for `Eq`/`Ne`),
//!   structured control flow (`Block` / `Loop` / `Br` / `BrIf` /
//!   `If`), and `Return`.
//!
//! Ops outside the Phase B envelope (stdlib `Call`, pointer-indirect
//! `StoreField`, `MakeClosure`, sandbox-trap helpers, schema-method
//! dispatch, …) surface as [`crate::LlvmError::Codegen`]. They are
//! tracked for Phase C.
//!
//! ## Control-flow lowering vs cranelift
//!
//! Cranelift's `block-with-params` keeps phi nodes implicit (every
//! branch passes the carried values as block arguments). LLVM IR
//! requires explicit `phi` nodes per joining basic block. We avoid
//! both by spilling the IR stack through `alloca` slots whenever
//! control flow joins, and reading them back on the consumer side.
//! That mirrors how a naive byte-code-to-LLVM emitter behaves and
//! relies on LLVM's `mem2reg` pass at -O2/-O3 to turn the alloca
//! reads back into SSA values + phis. For the W1 / W2 hot loops
//! `mem2reg` collapses the alloca traffic into a single
//! loop-carried IR value (verified via `emit_ir_dump`'s output at
//! `-O2`).
//!
//! ## Stack discipline
//!
//! The IR's stack machine carries one value per push. We track the
//! per-op operand stack as `Vec<IntValue>` (every IR value the W1/W2
//! envelope produces fits in an integer type — I32 for Bool / I32-
//! tagged values, I64 for I64-tagged values). The wasm-style "every
//! value above the operand stack is unreachable after `br`" rule
//! lets us drop unconsumed stack slots silently — LLVM's verifier
//! catches missing terminators if we forget to seal a block.

use std::collections::HashMap;

use inkwell::builder::Builder;
use inkwell::context::Context;
use inkwell::module::{Linkage, Module as LlvmModule};
use inkwell::types::{BasicMetadataTypeEnum, BasicTypeEnum};
use inkwell::values::{BasicValue, BasicValueEnum, FunctionValue, IntValue, PointerValue};
use inkwell::{AddressSpace, IntPredicate};

use relon_ir::ir::{Func, IrType, Module as IrModule, Op, TaggedOp};

use crate::error::LlvmError;
use crate::state::{ARENA_STATE_OFFSET_BASE, ARENA_STATE_OFFSET_TAIL_CURSOR};

// Per-`Op`-family lowering modules. Each holds an
// `impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp>` block with the `emit_*`
// methods for that family; the exhaustive `lower_op` dispatch below
// delegates to them. Mirrors the cranelift backend's `codegen/*`
// split so Phase 0b can fill unimplemented families in place without
// colliding. (Behavior-preserving reorg — Phase 0a.)
mod arith;
mod call;
mod closure;
mod collections;
mod control;
mod mem;
mod schema;
mod string;
mod unicode;

// Family-local enums consumed by the central `lower_op` dispatch.
use arith::BinOp;
use mem::{AbsLoad, AbsStore};

/// Canonical export name the entry function uses in the emitted LLVM
/// module.
///
/// The evaluator side `dlsym`s / `get_function`s against this symbol
/// after JIT finalize, so renaming it requires touching both crates
/// simultaneously.
///
/// The optional typed fast-path entry is exported under its own name,
/// [`ENTRY_SYMBOL_FAST`].
pub(crate) const ENTRY_SYMBOL: &str = "relon_llvm_entry";

/// Attach `!invariant.load !{}` metadata to a `load` so LLVM may assume
/// every load from that address yields the same value for the
/// instruction's lifetime. That lets GVN/LICM hoist the load out of
/// loops and fold redundant reloads.
///
/// # Soundness
///
/// Valid ONLY for memory that is genuinely invariant across the call.
/// The single caller is the per-entry / per-lambda `state.arena_base`
/// word load (`ARENA_STATE_OFFSET_BASE`): the host writes the base
/// pointer into the `ArenaState` struct *before* the entry runs and
/// never mutates it while the call is in flight (only the scratch /
/// tail cursors at later offsets are written — see `state.rs`; no
/// `build_store` ever targets offset 0).
///
/// # Why it matters
///
/// Without the tag LLVM reloads the base from the opaque state pointer
/// on every arena access inside a loop (the W20 n-body inner loop showed
/// a `mov (%state), %base` reload per pair access), because it cannot
/// prove that the intervening arena stores don't alias the state struct.
/// The tag is metadata only — it changes no value, so every backend
/// stays bit-identical.
///
/// Values that are not backed by an instruction are left untouched.
fn mark_invariant_load(ctx: &Context, loaded: BasicValueEnum<'_>) {
    // Nothing to annotate unless the value is produced by an instruction.
    let load_inst = match loaded.as_instruction_value() {
        Some(inst) => inst,
        None => return,
    };
    let invariant_kind = ctx.get_kind_id("invariant.load");
    let empty_node = ctx.metadata_node(&[]);
    // Best-effort: the outcome of attaching the metadata is deliberately
    // ignored.
    let _ = load_inst.set_metadata(empty_node, invariant_kind);
}

/// Export name of the Phase D.1 dispatch-boundary fast-path entry.
///
/// This is a second exported entry emitted alongside the buffer-protocol
/// entry whenever the source's `#main(Int...) -> Int` shape qualifies.
/// It skips the HashMap pack + arena round-trip the buffer envelope
/// incurs, dropping the per-call boundary cost from the ~650 ns band
/// into the rust-native ballpark.
///
/// Only resolved when the evaluator's [`FastPathProfile`] is `Some`;
/// the symbol is absent from the JIT module otherwise.
pub(crate) const ENTRY_SYMBOL_FAST: &str = "relon_llvm_entry_fast";

/// Signature shape of the entry function the LLVM emitter generates.
///
/// Mirrors the cranelift crate's `EntryShape` enum so a side-by-side
/// comparison of the two backends shares the same vocabulary.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum EntryShape {
    /// `(I64...) -> I64`. The Phase A bootstrap envelope — used by
    /// `from_ir_direct` callers (tests, helloworld_arith fixtures).
    LegacyI64,
    /// `(*state, i32 in_ptr, i32 in_len, i32 out_ptr, i32 out_cap,
    /// i64 caps) -> i32`. The shape `lower_workspace_single`
    /// synthesises for every user `#main` source. State is the
    /// first parameter to match the cranelift backend's
    /// `BufferEntryFn` layout.
    Buffer,
}

/// How `Op::CallNative` is dispatched in the emitted code.
///
/// Stage 1.B: selects between **open-world** dynamic dispatch (the
/// `relon_llvm_call_native` helper resolved at runtime via
/// `add_global_mapping`) and **closed-world** static dispatch (a direct
/// `call @<host_symbol>` to an `extern` declaration the LTO co-compile
/// step later links + inlines).
///
/// `OpenWorld` is the default and the only path MCJIT / `from_source`
/// ever uses — it must stay reachable verbatim. `ClosedWorld` is only
/// selected by the co-compile orchestration (`crate::cocompile`) when
/// the full host-fn set is known at emit time (the build.rs /
/// `emit_object` path), mirroring cranelift's *static* `cap_lookup ->
/// fn_ptr` arm rather than its `_dynamic` helper arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WorldMode {
    /// Dynamic dispatch through `relon_llvm_call_native`. Default so
    /// the existing MCJIT / `from_source` path is untouched.
    #[default]
    OpenWorld,
    /// Static `call @<host_symbol>` to an external declaration. The
    /// host bitcode is linked in + inlined by the LTO co-compile pass.
    ClosedWorld,
}

/// Phase D.1 fast-path profile: describes a `#main(Int...) -> Int`
/// source shape eligible for the typed legacy-i64 dispatch fast path.
///
/// The profile maps each declared `#main` Int parameter's buffer
/// offset to the LLVM fast entry's i64 positional slot, and records
/// the offset of the single Int return slot so the trailing
/// `StoreField` can be rewritten into a `ret`. Used exclusively by
/// [`emit_fast_entry`].
#[derive(Debug, Clone)]
pub(crate) struct FastPathProfile {
    /// One entry per declared `#main` arg: the field's byte offset in
    /// the input buffer (matches what `LoadField { offset }` carries
    /// in the IR body). Vector order parallels schema declaration order.
    ///
    /// NOTE(review): only the byte offset is stored here; the i64 slot
    /// index in the fast entry signature is presumably the element's
    /// position in this vector — confirm against `emit_fast_entry`.
    pub(crate) arg_offsets: Vec<u32>,
    /// Byte offset of the single `value` field in the return buffer.
    /// The trailing `StoreField { offset, ty: I64 }` whose offset
    /// matches this value gets rewritten into a `ret` on the value
    /// (after popping the IR stack normally). Any other `StoreField`
    /// surfaces as an emitter error — the fast path only handles
    /// single-value-wrapper returns.
    pub(crate) ret_offset: u32,
}

/// Phase E.1: per-module const-pool blob laid out at compile time and
/// copied into the arena prefix on every dispatch. Mirrors
/// `relon_codegen_cranelift::codegen::ConstPool` (shape only — the LLVM
/// side keeps it scoped to this crate so the dep direction stays
/// one-way).
///
/// Layout: `[len: u32 LE][utf8 bytes]` records emitted in IR-walk
/// order, aligned to 4. Each `Op::ConstString { idx }` resolves to
/// `string_offsets[idx]` — the byte offset of its record inside
/// [`Self::bytes`] (= the arena-relative offset once the host has
/// copied the blob to the arena prefix).
#[derive(Debug, Default, Clone)]
pub struct ConstPool {
    /// `idx -> byte offset within `bytes`. The emitter materialises
    /// `Op::ConstString { idx }` as `iconst(I32, string_offsets[idx])`.
    pub string_offsets: std::collections::HashMap<u32, u32>,
    /// `List<Int>` pool: `idx -> byte offset`. Mirrors cranelift's
    /// `ConstPool::list_int_offsets`; record layout is
    /// `[len: u32 LE][pad: u32][i64 elements LE]`, aligned to 8.
    pub list_int_offsets: std::collections::HashMap<u32, u32>,
    /// `List<Float>` pool: `idx -> byte offset`. Same layout as
    /// `list_int_offsets` (f64 elements stored as their u64 LE
    /// bit-pattern), aligned to 8.
    pub list_float_offsets: std::collections::HashMap<u32, u32>,
    /// `List<Bool>` pool: `idx -> byte offset`. Record layout is
    /// `[len: u32 LE][u8 booleans]` (tightly packed), aligned to 4.
    pub list_bool_offsets: std::collections::HashMap<u32, u32>,
    /// W5-P2: `List<String>` pointer-array pool: `idx -> header byte
    /// offset`. Record layout (byte-identical to cranelift's
    /// `visit_const_list_string`): each element's `[slen: u32 LE][utf8]`
    /// String record is emitted first (4-aligned), then the header
    /// `[len: u32 LE][off_0: u32 LE]...[off_{N-1}: u32 LE]` whose
    /// `off_i` is the arena-relative offset of String record `i`.
    pub list_string_offsets: std::collections::HashMap<u32, u32>,
    /// W5-P1/P3: `{String -> Int}` dict pool: `idx -> record byte
    /// offset`. Record layout (byte-identical to cranelift's
    /// `visit_const_dict`): `[entry_count: u32 LE][pad: u32][shape_hash:
    /// u64 LE]` header, a `[key_off: u32][key_len: u32][value: i64]`
    /// entry table sorted by key bytes, then the concatenated UTF-8 key
    /// payload (`key_off` record-relative). The W5-P3 dict-get probe
    /// binary-/linear-searches this table at runtime.
    pub dict_offsets: std::collections::HashMap<u32, u32>,
    /// Wave R14: Unicode `*TableAddr` pool. Each distinct
    /// [`unicode::UnicodeTable`] referenced anywhere in the module (incl.
    /// inlined bundled-stdlib helper bodies) is encoded once via the
    /// shared `relon_ir` encoders and laid into [`Self::bytes`]; the
    /// `*TableAddr` op resolves to the recorded arena-relative offset.
    /// Byte-identical to cranelift's per-table `ConstPool` slots.
    pub(crate) unicode_table_offsets: std::collections::HashMap<unicode::UnicodeTable, u32>,
    /// Materialised bytes in record order. The host trampoline copies
    /// these verbatim to `arena[..bytes.len()]` before every dispatch.
    pub bytes: Vec<u8>,
}

impl ConstPool {
    /// Build the pool by walking every function body in `module` and
    /// collecting each unique `Op::ConstString { idx, value }`. Records
    /// are laid out in walk-order with 4-byte alignment.
    pub fn from_module(module: &IrModule) -> Result<Self, LlvmError> {
        let mut pool = ConstPool::default();
        for func in &module.funcs {
            pool.collect_body(&func.body)?;
        }
        Ok(pool)
    }

    fn collect_body(&mut self, body: &[TaggedOp]) -> Result<(), LlvmError> {
        for tagged in body {
            self.collect_op(&tagged.op)?;
        }
        Ok(())
    }

    fn collect_op(&mut self, op: &Op) -> Result<(), LlvmError> {
        match op {
            Op::ConstString { idx, value } => self.add_string(*idx, value),
            Op::ConstListInt { idx, elements } => self.add_list_int(*idx, elements),
            Op::ConstListFloat { idx, elements } => self.add_list_float(*idx, elements),
            Op::ConstListBool { idx, elements } => self.add_list_bool(*idx, elements),
            Op::ConstListString { idx, elements } => self.add_list_string(*idx, elements),
            Op::ConstDict { idx, entries } => self.add_dict(*idx, entries),
            Op::Block { body, .. } | Op::Loop { body, .. } => self.collect_body(body),
            Op::If {
                then_body,
                else_body,
                ..
            } => {
                self.collect_body(then_body)?;
                self.collect_body(else_body)
            }
            // Op::Call inlines a bundled-stdlib body whose own
            // `Op::ConstString` literals must also land in the pool —
            // mirror cranelift's recursion through `builtin_stdlib`.
            Op::Call { fn_index, .. } => {
                let stdlib = relon_ir::stdlib::builtin_stdlib();
                if let Some(callee) = stdlib.get(*fn_index as usize) {
                    let body = callee.body_owned();
                    self.collect_body(&body)?;
                }
                Ok(())
            }
            // Wave R14: Unicode `*TableAddr` ops. Lay each referenced
            // table into the const prefix once (deduped by table identity)
            // so the lowering resolves to a fixed offset instead of
            // copying the table into scratch per op-execution.
            other => {
                if let Some(table) = unicode::UnicodeTable::from_op(other) {
                    self.add_unicode_table(table)?;
                }
                Ok(())
            }
        }
    }

    /// Lay `table`'s encoded bytes into the pool on first reference and
    /// record the arena-relative offset. The byte encoder is the exact
    /// shared `relon_ir` function cranelift's `ConstPool` calls, so the
    /// data a lookup helper reads is byte-identical across backends.
    /// Aligned to 4 to match every `*TableAddr` slot on the cranelift
    /// side (the table headers are read with 4-byte-aligned i32 loads).
    fn add_unicode_table(&mut self, table: unicode::UnicodeTable) -> Result<(), LlvmError> {
        if self.unicode_table_offsets.contains_key(&table) {
            return Ok(());
        }
        self.align_to(4);
        let off = u32::try_from(self.bytes.len())
            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
        let bytes = table.encode_bytes();
        self.bytes.extend_from_slice(&bytes);
        self.unicode_table_offsets.insert(table, off);
        Ok(())
    }

    fn add_string(&mut self, idx: u32, value: &str) -> Result<(), LlvmError> {
        if self.string_offsets.contains_key(&idx) {
            return Ok(());
        }
        // Align to 4 so the `[len: u32]` header lands on a 4-byte
        // boundary — i32 loads through the JIT use `align=4` and we
        // don't want an unaligned trap on hosts where it matters.
        self.align_to(4);
        let off = u32::try_from(self.bytes.len())
            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
        let len = u32::try_from(value.len())
            .map_err(|_| LlvmError::Codegen("ConstString length exceeds u32 range".into()))?;
        self.bytes.extend_from_slice(&len.to_le_bytes());
        self.bytes.extend_from_slice(value.as_bytes());
        self.string_offsets.insert(idx, off);
        Ok(())
    }

    /// Pad `bytes` up to the next `align` boundary with zero fill.
    /// Mirrors cranelift's `ConstPool::align_to`.
    fn align_to(&mut self, align: usize) {
        let rem = self.bytes.len() % align;
        if rem != 0 {
            self.bytes.resize(self.bytes.len() + (align - rem), 0);
        }
    }

    /// Lay out a `List<Int>` record. Byte layout
    /// `[len: u32 LE][pad: u32 zero][i64 elements LE]`, aligned to 8 —
    /// byte-identical to cranelift's `visit_const_list_int` (cross-
    /// backend arena data contract).
    fn add_list_int(&mut self, idx: u32, elements: &[i64]) -> Result<(), LlvmError> {
        if self.list_int_offsets.contains_key(&idx) {
            return Ok(());
        }
        self.align_to(8);
        let off = u32::try_from(self.bytes.len())
            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
        let len = u32::try_from(elements.len())
            .map_err(|_| LlvmError::Codegen("ConstListInt length exceeds u32 range".into()))?;
        self.bytes.extend_from_slice(&len.to_le_bytes());
        self.bytes.extend_from_slice(&[0u8; 4]); // pad to 8
        for e in elements {
            self.bytes.extend_from_slice(&e.to_le_bytes());
        }
        self.list_int_offsets.insert(idx, off);
        Ok(())
    }

    /// Lay out a `List<Float>` record. Same layout as `add_list_int`
    /// (f64 elements stored as their u64 LE bit-pattern), aligned to 8 —
    /// byte-identical to cranelift's `visit_const_list_float`.
    fn add_list_float(&mut self, idx: u32, elements: &[u64]) -> Result<(), LlvmError> {
        if self.list_float_offsets.contains_key(&idx) {
            return Ok(());
        }
        self.align_to(8);
        let off = u32::try_from(self.bytes.len())
            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
        let len = u32::try_from(elements.len())
            .map_err(|_| LlvmError::Codegen("ConstListFloat length exceeds u32 range".into()))?;
        self.bytes.extend_from_slice(&len.to_le_bytes());
        self.bytes.extend_from_slice(&[0u8; 4]); // pad to 8
        for e in elements {
            self.bytes.extend_from_slice(&e.to_le_bytes());
        }
        self.list_float_offsets.insert(idx, off);
        Ok(())
    }

    /// Lay out a `List<Bool>` record. Byte layout
    /// `[len: u32 LE][u8 booleans]` (tightly packed), aligned to 4 —
    /// byte-identical to cranelift's `visit_const_list_bool`.
    fn add_list_bool(&mut self, idx: u32, elements: &[bool]) -> Result<(), LlvmError> {
        if self.list_bool_offsets.contains_key(&idx) {
            return Ok(());
        }
        self.align_to(4);
        let off = u32::try_from(self.bytes.len())
            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
        let len = u32::try_from(elements.len())
            .map_err(|_| LlvmError::Codegen("ConstListBool length exceeds u32 range".into()))?;
        self.bytes.extend_from_slice(&len.to_le_bytes());
        for e in elements {
            self.bytes.push(if *e { 1 } else { 0 });
        }
        self.list_bool_offsets.insert(idx, off);
        Ok(())
    }

    /// W5-P2: lay out a `List<String>` pointer-array record. Each
    /// element's `[slen: u32 LE][utf8]` String record is emitted first
    /// (4-aligned), then the header `[len: u32 LE][off_0: u32 LE]...`
    /// whose `off_i` is the arena-relative offset of String record `i`.
    /// Byte-identical to cranelift's `visit_const_list_string` (cross-
    /// backend arena data contract); the `idx -> header offset` map is
    /// what `Op::ConstListString` resolves to.
    fn add_list_string(&mut self, idx: u32, elements: &[String]) -> Result<(), LlvmError> {
        if self.list_string_offsets.contains_key(&idx) {
            return Ok(());
        }
        self.align_to(4);
        let mut str_offsets: Vec<u32> = Vec::with_capacity(elements.len());
        for s in elements {
            self.align_to(4);
            let s_off = u32::try_from(self.bytes.len()).map_err(|_| {
                LlvmError::Codegen("ConstListString string offset exceeds u32".into())
            })?;
            let slen = u32::try_from(s.len()).map_err(|_| {
                LlvmError::Codegen("ConstListString element length exceeds u32".into())
            })?;
            self.bytes.extend_from_slice(&slen.to_le_bytes());
            self.bytes.extend_from_slice(s.as_bytes());
            str_offsets.push(s_off);
        }
        self.align_to(4);
        let header_off = u32::try_from(self.bytes.len())
            .map_err(|_| LlvmError::Codegen("ConstListString header offset exceeds u32".into()))?;
        let len = u32::try_from(elements.len())
            .map_err(|_| LlvmError::Codegen("ConstListString length exceeds u32".into()))?;
        self.bytes.extend_from_slice(&len.to_le_bytes());
        for off in &str_offsets {
            self.bytes.extend_from_slice(&off.to_le_bytes());
        }
        self.list_string_offsets.insert(idx, header_off);
        Ok(())
    }

    /// W5-P1/P3: lay out a `{String -> Int}` dict record. Byte-identical
    /// to cranelift's `const_pool::visit_const_dict` (cross-backend
    /// arena data contract) so the W5-P3 dict-get probe reads the same
    /// bytes on either backend:
    ///
    /// ```text
    /// [entry_count: u32 LE][pad: u32][shape_hash: u64 LE]   ; 16-byte header
    /// entry_count × [key_off: u32 LE][key_len: u32 LE][value: i64 LE]
    /// concatenated UTF-8 key bytes                          ; key_off record-rel
    /// ```
    ///
    /// The entry table is sorted by key bytes (deterministic + probe-
    /// friendly); the record start is 8-aligned so the i64 values + the
    /// u64 shape_hash land on natural boundaries.
    fn add_dict(&mut self, idx: u32, entries: &[(String, i64)]) -> Result<(), LlvmError> {
        if self.dict_offsets.contains_key(&idx) {
            return Ok(());
        }
        self.align_to(8);
        let off = u32::try_from(self.bytes.len())
            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;

        let mut sorted: Vec<&(String, i64)> = entries.iter().collect();
        sorted.sort_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()));

        let entry_count = u32::try_from(sorted.len())
            .map_err(|_| LlvmError::Codegen("ConstDict entry count exceeds u32".into()))?;
        let shape_hash =
            relon_ir::shape_hash::shape_hash_for_keys(sorted.iter().map(|(k, _)| k.as_str()));

        // Header.
        self.bytes.extend_from_slice(&entry_count.to_le_bytes());
        self.bytes.extend_from_slice(&[0u8; 4]); // pad: keep shape_hash 8-aligned
        self.bytes.extend_from_slice(&shape_hash.to_le_bytes());

        const HEADER_BYTES: u32 = 16;
        const ENTRY_BYTES: u32 = 16;
        let table_bytes = entry_count
            .checked_mul(ENTRY_BYTES)
            .ok_or_else(|| LlvmError::Codegen("ConstDict table size overflow".into()))?;
        let key_payload_base = HEADER_BYTES
            .checked_add(table_bytes)
            .ok_or_else(|| LlvmError::Codegen("ConstDict key base overflow".into()))?;

        // Entry table. key_off is record-relative; accumulate as we go.
        let mut running_key_off = key_payload_base;
        for (key, value) in &sorted {
            let key_len = u32::try_from(key.len())
                .map_err(|_| LlvmError::Codegen("ConstDict key length exceeds u32".into()))?;
            self.bytes.extend_from_slice(&running_key_off.to_le_bytes());
            self.bytes.extend_from_slice(&key_len.to_le_bytes());
            self.bytes.extend_from_slice(&value.to_le_bytes());
            running_key_off = running_key_off
                .checked_add(key_len)
                .ok_or_else(|| LlvmError::Codegen("ConstDict key offset overflow".into()))?;
        }

        // Key payload.
        for (key, _) in &sorted {
            self.bytes.extend_from_slice(key.as_bytes());
        }

        self.dict_offsets.insert(idx, off);
        Ok(())
    }
}

/// Whether an IR signature selects [`EntryShape::Buffer`]: exactly four
/// `I32` params followed by one `I64` param, returning `I32`. Mirrors
/// `is_buffer_protocol_signature` on the cranelift side.
pub(crate) fn is_buffer_protocol_signature(params: &[IrType], ret: IrType) -> bool {
    match (params, ret) {
        ([leading @ .., IrType::I64], IrType::I32) if leading.len() == 4 => {
            leading.iter().all(|ty| matches!(ty, IrType::I32))
        }
        _ => false,
    }
}

/// Open-world emit entry point — the only one MCJIT / `from_source`
/// use. Phase E.2 multi-function emit: lowers every reachable IR
/// function into LLVM. `Op::CallNative` lowers to the dynamic
/// `relon_llvm_call_native` helper. The signature is kept stable so
/// the `evaluator.rs` call sites stay untouched.
///
/// # Arguments
///
/// - `entry`: the entry function, emitted under either the legacy-i64
///   or the buffer-protocol shape.
/// - `helpers`: each one is emitted as a sibling function with a plain
///   typed `(params...) -> ret` signature, so the entry's `Op::Call`
///   lowering can reach it through a direct LLVM `call` instruction.
/// - `helper_ir_indices`: parallels `helpers` — element `i` is the
///   IR-side `funcs` index of helper `i`. The `Op::Call` lowering uses
///   it to resolve `fn_index - stdlib_count` back to the matching
///   `FunctionValue`.
/// - `lambdas` (Phase F.W7, closures-as-values): the IR funcs the
///   lowering pass appended to the module's closure table (a
///   `#main`-side `fib: (k) => ...` lifts to a lambda Func). Each is
///   declared / emitted as
///   `(state, captures_ptr, ...lambda.params[1..]) -> ret`, so the
///   body's `LocalGet(0)` reads the captures_ptr arg and
///   `Op::AllocScratch` / `*AtAbsolute` ops inside the body can reach
///   the per-call arena state.
/// - `closure_table`: mirrors the IR's `Module::closure_table`, telling
///   the emitter which `fn_table_idx` resolves to which lambda
///   `FunctionValue`.
/// - `const_pool`: the per-module ConstString blob that entry + helper
///   bodies index into via `Op::ConstString { idx }`. The host copies
///   `const_pool.bytes` to the arena prefix before every dispatch so
///   the materialised `iconst(I32, offset)` resolves to a stable
///   address.
///
/// # Returns
///
/// The entry `FunctionValue`, the detected entry shape, the helper
/// lookup table the `Emit` driver hands to the per-function lowering
/// so sibling calls can find their callee, and the closure table (one
/// entry per `fn_table_idx`, in source order) that the
/// `Op::MakeClosure` / `Op::CallClosure` lowering refers to.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn emit_module_funcs<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    // Fixed knobs for this entry point: open-world dispatch on the
    // native target, with no per-import effectful flags.
    let world_mode = WorldMode::OpenWorld;
    let target = crate::CodegenTarget::Native;
    let effectful_imports: &[bool] = &[];

    emit_module_funcs_impl(
        ctx,
        module,
        entry,
        buffer_return_size,
        const_pool,
        helpers,
        helper_ir_indices,
        lambdas,
        closure_table,
        imports,
        world_mode,
        target,
        effectful_imports,
    )
}

/// P3 §2.2 wasm32 emit entry point. Uses the same open-world dispatch
/// as [`emit_module_funcs`], but with wasm32 as the target: an
/// `Op::CallNative` becomes a **wasm import** call
/// ([`crate::wasi_host`]) rather than a call to the native
/// `relon_llvm_call_native` MCJIT helper. Only the
/// `emit_object_for_target(.., CodegenTarget::Wasm32)` object-emit path
/// uses it.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn emit_module_funcs_wasm<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    // Fixed knobs for this entry point: open-world dispatch on the
    // wasm32 target, with no per-import effectful flags.
    let world_mode = WorldMode::OpenWorld;
    let target = crate::CodegenTarget::Wasm32;
    let effectful_imports: &[bool] = &[];

    emit_module_funcs_impl(
        ctx,
        module,
        entry,
        buffer_return_size,
        const_pool,
        helpers,
        helper_ir_indices,
        lambdas,
        closure_table,
        imports,
        world_mode,
        target,
        effectful_imports,
    )
}

/// Stage 1.B closed-world emit entry point. Here `Op::CallNative`
/// becomes a static `call @<host_symbol>` against an `extern`
/// declaration, and [`crate::cocompile`] links in + inlines the host
/// bitcode. Only the co-compile orchestration calls this — MCJIT /
/// `from_source` never do.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn emit_module_funcs_closed_world<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    // Fixed knobs for this entry point: closed-world dispatch on the
    // native target, with no per-import effectful flags.
    let world_mode = WorldMode::ClosedWorld;
    let target = crate::CodegenTarget::Native;
    let effectful_imports: &[bool] = &[];

    emit_module_funcs_impl(
        ctx,
        module,
        entry,
        buffer_return_size,
        const_pool,
        helpers,
        helper_ir_indices,
        lambdas,
        closure_table,
        imports,
        world_mode,
        target,
        effectful_imports,
    )
}

/// P3 §2.2 wasm closed-world co-compile emit entry point: the
/// **wasm32** counterpart of [`emit_module_funcs_closed_world`].
///
/// How an `Op::CallNative` lowers depends on the import's effectful
/// flag (ADR §2.2: pure inline, effectful → WASI):
///
/// - **pure-compute** (flag `false`): a direct `call @<host_symbol>`
///   that the wasm host-shim co-compile
///   ([`crate::cocompile::link_and_inline_host_shim_wasm`]) links +
///   inlines into the wasm unit, mirroring the native closed-world
///   inline.
/// - **effectful** (flag `true` — capability-gated by a preceding
///   `Op::CheckCap`): a **wasm import** call ([`crate::wasi_host`]), so
///   the side effect crosses the sandbox boundary back out to the
///   trusted host.
///
/// `effectful_imports[i]` is the flag for `import_idx == i`; the caller
/// (`emit_object_for_target`) derives it from the IR's
/// CheckCap → CallNative shape.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn emit_module_funcs_closed_world_wasm<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
    effectful_imports: &[bool],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    // Fixed knobs for this entry point: closed-world dispatch on the
    // wasm32 target; the effectful flags come from the caller.
    let world_mode = WorldMode::ClosedWorld;
    let target = crate::CodegenTarget::Wasm32;

    emit_module_funcs_impl(
        ctx,
        module,
        entry,
        buffer_return_size,
        const_pool,
        helpers,
        helper_ir_indices,
        lambdas,
        closure_table,
        imports,
        world_mode,
        target,
        effectful_imports,
    )
}

#[allow(clippy::too_many_arguments, clippy::type_complexity)]
fn emit_module_funcs_impl<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
    world_mode: WorldMode,
    target: crate::CodegenTarget,
    effectful_imports: &[bool],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    // Step 0: declare module-level intrinsics. `llvm.trap` is shared
    // by every Div / Mod sandbox guard so a single declaration covers
    // every per-op guard across every emitted function.
    declare_llvm_trap(ctx, module);

    // Step 1: declare every helper up-front so the entry / sibling
    // bodies can resolve forward references (mutual recursion, the
    // `fib(n - 1) + fib(n - 2)` self-call). LLVM is happy to issue
    // `call @foo` against a declared-only function; the body is
    // attached on the second pass.
    let mut helper_table: HashMap<u32, FunctionValue<'ctx>> = HashMap::new();
    if let Some(ir_indices) = helper_ir_indices {
        if ir_indices.len() != helpers.len() {
            return Err(LlvmError::Codegen(format!(
                "emit_module_funcs: helpers.len()={} but helper_ir_indices.len()={}",
                helpers.len(),
                ir_indices.len()
            )));
        }
    }
    for (i, helper) in helpers.iter().enumerate() {
        let fv = declare_helper_function(ctx, module, helper, i)?;
        let ir_idx = helper_ir_indices.map(|v| v[i]).unwrap_or(i as u32);
        helper_table.insert(ir_idx, fv);
    }

    // Phase F.W7: declare every lambda function up-front. Lambdas use
    // a widened signature `(state, ...lambda.params) -> ret` — the
    // first IR param (already `IrType::I32`, the captures_ptr the IR
    // lowering pass prepended in `lower_closure_as_value`) becomes
    // LLVM param 1 (just past the implicit `*state`). Subsequent
    // user params shift to LLVM param indices 2.. so the body's
    // `LocalGet(idx)` resolves to LLVM param `idx + 1`
    // (`param_base = 1`).
    let mut closure_fn_table: Vec<FunctionValue<'ctx>> = Vec::with_capacity(closure_table.len());
    if lambdas.len() != closure_table.len() {
        return Err(LlvmError::Codegen(format!(
            "emit_module_funcs: lambdas.len()={} but closure_table.len()={}",
            lambdas.len(),
            closure_table.len()
        )));
    }
    for (slot, lambda) in lambdas.iter().enumerate() {
        let fv = declare_lambda_function(ctx, module, lambda, slot)?;
        closure_fn_table.push(fv);
    }

    // Step 2: emit the entry function body.
    let (entry_fn, shape) = if is_buffer_protocol_signature(&entry.params, entry.ret) {
        let fv = emit_buffer_entry_with_helpers_and_closures(
            ctx,
            module,
            entry,
            buffer_return_size,
            const_pool,
            &helper_table,
            &closure_fn_table,
            imports,
            world_mode,
            target,
            effectful_imports,
        )?;
        (fv, EntryShape::Buffer)
    } else {
        // The legacy-i64 entry shape covers hand-built fixtures only; it
        // never references ConstString and supplies its own empty pool
        // inside `emit_legacy_entry_impl`.
        let fv =
            emit_legacy_entry_with_helpers(ctx, module, entry, &helper_table, imports, world_mode)?;
        (fv, EntryShape::LegacyI64)
    };

    // Step 3: emit each helper body now that every callee is declared.
    for helper in helpers.iter() {
        let helper_fn = helper_table
            .values()
            .find(|fv| {
                // Locate the FunctionValue by name; cheap enough — the
                // helper table is tiny and the find runs once per
                // helper.
                let expected = format!("relon_helper_{}", helper.name);
                fv.get_name().to_string_lossy() == expected
            })
            .copied()
            .ok_or_else(|| {
                LlvmError::Codegen(format!(
                    "emit_module_funcs: helper `{}` declared but FunctionValue missing",
                    helper.name
                ))
            })?;
        emit_helper_body(ctx, module, helper, helper_fn, const_pool, &helper_table)?;
    }

    // Step 4 (Phase F.W7): emit each lambda body. Lambdas share the
    // `helper_table` so the body can route an inner `Op::Call` to a
    // sibling helper (Phase E.2 cross-call). They also share the
    // `closure_fn_table` so a nested `Op::MakeClosure` resolves the
    // matching lambda FunctionValue from its `fn_table_idx`.
    //
    // Build the module-wide self-capture table once before emitting
    // lambda bodies. The table maps each lambda's `fn_table_idx` to
    // the captures-struct offsets that hold self-recursive handles
    // (i.e. handles whose `captures_ptr` field equals the lambda's
    // own captures_ptr arg). The lambda-body emit uses this table to
    // stamp [`Provenance::OwnCaptureHandle`] on the matching capture
    // loads so the recursive call site can pick the direct-call fast
    // path. Empty for modules that have no self-recursive closures.
    let self_capture_table = build_self_capture_table(entry, helpers, lambdas);
    // Devirtualisation (W18): companion table for captures of known
    // (non-self) closures — lets the W18 predicate's `is_prime` call
    // devirtualise inside the predicate lambda body.
    let known_capture_table = build_known_capture_table(entry, helpers, lambdas);
    for (slot, lambda) in lambdas.iter().enumerate() {
        let lambda_fn = closure_fn_table[slot];
        let slot_u32 = slot as u32;
        let offsets = self_capture_table
            .get(&slot_u32)
            .cloned()
            .unwrap_or_default();
        let known_offsets = known_capture_table
            .get(&slot_u32)
            .cloned()
            .unwrap_or_default();
        emit_lambda_body(
            ctx,
            module,
            lambda,
            lambda_fn,
            const_pool,
            &helper_table,
            &closure_fn_table,
            &offsets,
            &known_offsets,
        )?;
    }

    Ok((entry_fn, shape, helper_table, closure_fn_table))
}

/// Phase F.W7 self-recursion fast path: collect the self-recursive
/// closure captures of the module.
///
/// The top-level ops of `entry`, then each of `helpers`, then each of
/// `lambdas` are scanned for the canonical adjacent pair
/// `Op::MakeClosure { fn_table_idx, captures }` immediately followed by
/// `Op::LetSet { idx, ty: Closure }` (the IR lowering emits the two
/// back to back — see `lower_anon_dict_body` /
/// `lower_closure_as_value`). Within such a pair, every capture of type
/// `Closure` whose `let_idx` equals the `LetSet`'s `idx` is a
/// self-recursive capture, as stamped by `lower_closure_as_value`'s
/// "let-slot not yet bound" branch.
///
/// Returns `fn_table_idx -> [(capture_offset, self_fn_table_idx)]`, in
/// scan order, so the lambda body emitter can stamp the matching
/// [`Provenance::OwnCaptureHandle`] on each capture load. A closure
/// with no self-capture gets no entry at all.
///
/// Anything that deviates from the adjacent pair — an op in between, a
/// `LetSet` of another type, a trailing `MakeClosure` — is skipped
/// silently: the fast path stays opt-in and the slow-path
/// `emit_call_closure` keeps working regardless.
fn build_self_capture_table(
    entry: &Func,
    helpers: &[&Func],
    lambdas: &[&Func],
) -> HashMap<u32, Vec<(u32, u32)>> {
    use relon_ir::ir::IrType as Irt;

    let mut table: HashMap<u32, Vec<(u32, u32)>> = HashMap::new();

    // Scan order matters for the order of the recorded pairs: entry
    // first, then helpers, then lambdas.
    let all_funcs = std::iter::once(entry)
        .chain(helpers.iter().copied())
        .chain(lambdas.iter().copied());

    for func in all_funcs {
        let ops = &func.body;
        // Walk each op together with its direct successor.
        for (maker, binder) in ops.iter().zip(ops.iter().skip(1)) {
            let (
                Op::MakeClosure {
                    fn_table_idx,
                    captures,
                    ..
                },
                Op::LetSet {
                    idx: bound_slot,
                    ty: Irt::Closure,
                },
            ) = (&maker.op, &binder.op)
            else {
                continue;
            };
            for cap in captures {
                let reads_own_slot = cap.let_idx == *bound_slot;
                if reads_own_slot && matches!(cap.ty, Irt::Closure) {
                    table
                        .entry(*fn_table_idx)
                        .or_default()
                        .push((cap.offset, *fn_table_idx));
                }
            }
        }
    }

    table
}

/// Devirtualisation (W18, 2026-05-30): companion to
/// [`build_self_capture_table`] for *non-self* captures of a closure
/// whose `fn_table_idx` is a compile-time constant.
///
/// Maps each lambda's `fn_table_idx` to the captures-struct offsets that
/// hold a handle produced by a literal `Op::MakeClosure { K }` (a
/// *known* closure), together with that `K`. The lambda-body emit uses
/// this to stamp [`Provenance::KnownClosure`] on the matching capture
/// load (the prologue `LocalGet(0); LoadI32AtAbsolute { offset };
/// LetSet { Closure }`), so a `CallClosure` against the capture (e.g.
/// the W18 predicate's `is_prime(k, 2)` call) emits a direct call
/// instead of the runtime `switch i32 %cc_fn_idx`.
///
/// Soundness: within each function we track, in source order, the
/// most-recent `MakeClosure { K }; LetSet { idx, Closure }` assignment
/// per outer let-slot. Any *other* `LetSet { idx, Closure }` clears the
/// slot — so a let that is reassigned to a dynamically-chosen closure is
/// never recorded as known. A capture is recorded only when its
/// `let_idx` resolves to a still-known slot AND the captured `K` differs
/// from the capturing lambda `L` (a self-capture, `K == L`, is owned by
/// [`build_self_capture_table`], whose `captures_ptr`-reuse fast path is
/// strictly better). The lowering pass emits the capturing
/// `MakeClosure` only after the captured let is bound and reads the live
/// slot, so the tracked `K` is exactly the value the capture holds.
fn build_known_capture_table(
    entry: &Func,
    helpers: &[&Func],
    lambdas: &[&Func],
) -> HashMap<u32, Vec<(u32, u32)>> {
    use relon_ir::ir::IrType as Irt;
    let mut table: HashMap<u32, Vec<(u32, u32)>> = HashMap::new();

    let scan = |func: &Func, table: &mut HashMap<u32, Vec<(u32, u32)>>| {
        let ops = &func.body;
        // outer let-slot -> known captured `fn_table_idx`, last-write
        // wins; cleared when the slot is reassigned a non-known closure.
        let mut known_slots: HashMap<u32, u32> = HashMap::new();
        for (i, tagged) in ops.iter().enumerate() {
            // Maintain `known_slots` off each `LetSet { idx, Closure }`:
            // if the immediately-preceding op is a `MakeClosure { K }`
            // (the canonical `MakeClosure; LetSet` binding the lowering
            // emits) the slot becomes a *known* closure `K`; any other
            // `LetSet { Closure }` stores a value we cannot prove is one
            // statically-known closure, so the slot is dropped. Driving
            // this off the `LetSet` (rather than the `MakeClosure`)
            // avoids the binding `LetSet` clobbering the very entry the
            // preceding `MakeClosure` established.
            if let Op::LetSet {
                idx,
                ty: Irt::Closure,
            } = tagged.op
            {
                if let Some(Op::MakeClosure { fn_table_idx, .. }) =
                    i.checked_sub(1).and_then(|p| ops.get(p)).map(|t| &t.op)
                {
                    known_slots.insert(idx, *fn_table_idx);
                } else {
                    known_slots.remove(&idx);
                }
                continue;
            }
            // At a capturing `MakeClosure { L }`, record each capture
            // that reads a still-known slot. The capturing closure's own
            // handle need NOT be stored to a let — the W18 predicate is
            // passed straight into `_list_filter` — because the fact
            // recorded here is about lambda `L`'s captures-struct layout
            // (offset O holds known closure K), which is fixed by `L`'s
            // own `MakeClosure` captures and the known-ness of the
            // captured outer let, independent of where `L`'s handle goes.
            if let Op::MakeClosure {
                fn_table_idx: l_idx,
                ref captures,
                ..
            } = tagged.op
            {
                for cap in captures {
                    if !matches!(cap.ty, Irt::Closure) {
                        continue;
                    }
                    if let Some(&k_idx) = known_slots.get(&cap.let_idx) {
                        // `k_idx == l_idx` is a self-capture — owned by
                        // `build_self_capture_table`; skip here.
                        if k_idx != l_idx {
                            table.entry(l_idx).or_default().push((cap.offset, k_idx));
                        }
                    }
                }
            }
        }
    };

    scan(entry, &mut table);
    for h in helpers {
        scan(h, &mut table);
    }
    for l in lambdas {
        scan(l, &mut table);
    }
    table
}

/// Devirtualisation (W18) correctness helper.
///
/// Appends to `out` the let-slot index of every
/// `Op::LetSet { ty: Closure }` found in `body`, descending into the
/// bodies of `Op::If` (both branches), `Op::Block` and `Op::Loop`.
/// Slots are pushed in traversal order and are not de-duplicated; all
/// other ops are ignored.
///
/// `emit_loop` uses the result to conservatively invalidate the
/// `KnownClosure` let-slot tracker for any closure slot a loop body
/// reassigns, so a cross-iteration read cannot devirtualise to a stale
/// target.
fn collect_closure_letset_slots(body: &[TaggedOp], out: &mut Vec<u32>) {
    use relon_ir::ir::IrType as Irt;

    for tagged in body.iter() {
        match &tagged.op {
            // Single-body containers: recurse into the nested stream.
            Op::Block { body: inner, .. } | Op::Loop { body: inner, .. } => {
                collect_closure_letset_slots(inner, out);
            }
            // Conditionals: `then` first, then `else`, so the output
            // order follows the op stream.
            Op::If {
                then_body,
                else_body,
                ..
            } => {
                collect_closure_letset_slots(then_body, out);
                collect_closure_letset_slots(else_body, out);
            }
            // The op we are looking for: a closure-typed let write.
            Op::LetSet {
                idx: slot,
                ty: Irt::Closure,
            } => out.push(*slot),
            _ => {}
        }
    }
}

/// Declare a sibling helper function's LLVM signature without emitting
/// its body. Used to seat every helper into the module so the entry's
/// `Op::Call` lowering can resolve forward references (recursion,
/// mutual recursion). Sibling helpers use a plain typed
/// `(params...) -> ret` shape — no `*state` pointer, no buffer
/// protocol; the test harness drives recursive Int-only functions
/// directly. When the IR layer grows first-class closure values
/// (Phase F), this signature widens to carry `(*state, captures, ...)`.
fn declare_helper_function<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    slot: usize,
) -> Result<FunctionValue<'ctx>, LlvmError> {
    let mut param_types: Vec<BasicMetadataTypeEnum<'ctx>> = Vec::with_capacity(func.params.len());
    for (i, p) in func.params.iter().enumerate() {
        let bt = ir_ty_to_llvm_abi(ctx, *p).ok_or_else(|| {
            LlvmError::UnsupportedSignature(format!(
                "llvm-aot: helper `{}` param #{i} type {p:?} unsupported",
                func.name
            ))
        })?;
        param_types.push(basic_to_metadata(bt));
    }
    let ret_bt = ir_ty_to_llvm_abi(ctx, func.ret).ok_or_else(|| {
        LlvmError::UnsupportedSignature(format!(
            "llvm-aot: helper `{}` return type {:?} unsupported",
            func.name, func.ret
        ))
    })?;
    let fn_type = match ret_bt {
        BasicTypeEnum::IntType(t) => t.fn_type(&param_types, false),
        BasicTypeEnum::FloatType(t) => t.fn_type(&param_types, false),
        BasicTypeEnum::PointerType(t) => t.fn_type(&param_types, false),
        other => {
            return Err(LlvmError::Codegen(format!(
                "llvm-aot: helper `{}` ret BasicType {other:?} unsupported",
                func.name
            )));
        }
    };
    // Use a deterministic LLVM symbol so the entry's call site can be
    // pretty-printed in the IR dump. The slot keeps multiple helpers
    // with the same source name (shouldn't happen, but cheap) from
    // colliding.
    let _ = slot;
    let llvm_name = format!("relon_helper_{}", func.name);
    let fv = module.add_function(&llvm_name, fn_type, Some(Linkage::Internal));
    Ok(fv)
}

/// Phase F.W7: declare a lambda function's LLVM signature without
/// emitting its body.
///
/// The LLVM signature is `(state: ptr, ...func.params) -> ret`:
///
/// * LLVM param 0 is an implicit pointer (`*state`), mirroring the
///   buffer-protocol entry's leading state slot so the lambda body's
///   `Op::AllocScratch{,Dyn}` / `Op::*AtAbsolute` ops can resolve
///   `arena_base` + scratch cursors through the same helper paths the
///   entry uses.
/// * LLVM params 1.. are the IR params in order. The first IR param is
///   the captures_ptr the IR lowering pass prepended in
///   `lower_closure_as_value`, so it surfaces as LLVM param 1; the
///   remaining ones are the lambda's user-visible args.
///
/// The function is added with internal linkage under the symbol
/// `relon_lambda_<slot>_<func.name>`.
///
/// # Errors
///
/// * [`LlvmError::UnsupportedSignature`] when a param or the return
///   type has no ABI mapping.
/// * [`LlvmError::Codegen`] when the mapped return type is not an
///   int / float / pointer type.
fn declare_lambda_function<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    slot: usize,
) -> Result<FunctionValue<'ctx>, LlvmError> {
    // Leading implicit `*state` pointer, then one slot per IR param.
    let mut param_types: Vec<BasicMetadataTypeEnum<'ctx>> =
        Vec::with_capacity(func.params.len() + 1);
    param_types.push(ctx.ptr_type(AddressSpace::default()).into());

    for (i, p) in func.params.iter().enumerate() {
        match ir_ty_to_llvm_abi(ctx, *p) {
            Some(bt) => param_types.push(basic_to_metadata(bt)),
            None => {
                return Err(LlvmError::UnsupportedSignature(format!(
                    "llvm-aot: lambda `{}` param #{i} type {p:?} unsupported",
                    func.name
                )));
            }
        }
    }

    let ret_bt = match ir_ty_to_llvm_abi(ctx, func.ret) {
        Some(bt) => bt,
        None => {
            return Err(LlvmError::UnsupportedSignature(format!(
                "llvm-aot: lambda `{}` return type {:?} unsupported",
                func.name, func.ret
            )));
        }
    };

    let fn_type = match ret_bt {
        BasicTypeEnum::IntType(int_ty) => int_ty.fn_type(&param_types, false),
        BasicTypeEnum::FloatType(float_ty) => float_ty.fn_type(&param_types, false),
        BasicTypeEnum::PointerType(ptr_ty) => ptr_ty.fn_type(&param_types, false),
        other => {
            return Err(LlvmError::Codegen(format!(
                "llvm-aot: lambda `{}` ret BasicType {other:?} unsupported",
                func.name
            )));
        }
    };

    // `relon_lambda_<slot>_<name>` keeps the emitted IR dump greppable
    // when debugging which `fn_table_idx` mapped to which body.
    let symbol = format!("relon_lambda_{}_{}", slot, func.name);
    Ok(module.add_function(&symbol, fn_type, Some(Linkage::Internal)))
}

/// Phase E.2: return the module's `llvm.trap` declaration, adding it
/// first if it is not present yet.
///
/// The intrinsic is declared as `void @llvm.trap()` (no parameters, not
/// variadic) with no explicit linkage. The lookup-before-add makes the
/// call idempotent, so emit passes that re-enter the emitter on the same
/// module reuse the existing declaration rather than adding another.
fn declare_llvm_trap<'ctx>(ctx: &'ctx Context, module: &LlvmModule<'ctx>) -> FunctionValue<'ctx> {
    const TRAP_INTRINSIC: &str = "llvm.trap";

    module.get_function(TRAP_INTRINSIC).unwrap_or_else(|| {
        let trap_ty = ctx.void_type().fn_type(&[], false);
        module.add_function(TRAP_INTRINSIC, trap_ty, None)
    })
}

/// Phase 0b: declare the `relon_llvm_call_native` host-dispatch helper
/// on `module` if absent. Signature mirrors the Rust helper:
///
/// ```text
/// i64 relon_llvm_call_native(ptr state, i32 import_idx,
///                            ptr args_ptr, i32 arg_count)
/// ```
///
/// `Linkage::External` so MCJIT resolves it to the host address the
/// evaluator registers via `add_global_mapping` (the default resolver
/// can't see the static from inside the host dylib's section layout —
/// same constraint as the `str.contains` shim). Idempotent so repeated
/// emit passes don't duplicate the declaration.
fn declare_call_native<'ctx>(ctx: &'ctx Context, module: &LlvmModule<'ctx>) -> FunctionValue<'ctx> {
    if let Some(f) = module.get_function(crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL) {
        return f;
    }
    let i64_t = ctx.i64_type();
    let i32_t = ctx.i32_type();
    let ptr_t = ctx.ptr_type(AddressSpace::default());
    let fn_ty = i64_t.fn_type(
        &[ptr_t.into(), i32_t.into(), ptr_t.into(), i32_t.into()],
        false,
    );
    module.add_function(
        crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL,
        fn_ty,
        Some(Linkage::External),
    )
}

/// Stage 1.B closed-world: declare a host `#native` fn as an external
/// function so `Op::CallNative` can emit a direct
/// `call @<host_symbol>`.
///
/// The declaration is named `import.name`, takes one `i64` per entry of
/// `import.param_tys` (the IR param types themselves are not consulted,
/// only their count), and returns `i64` — except for an
/// `IrType::Unit` return, which is declared `void`. This matches the
/// host shim's `#[no_mangle] extern "C" fn(i64...) -> i64` ABI the
/// co-compile step links in, and is deliberately the same i64 lane the
/// open-world helper decodes, so the two paths are bit-for-bit
/// differential-comparable.
///
/// Idempotent: if the module already holds a function named
/// `import.name`, it is returned as-is (its signature is not
/// re-checked).
fn declare_host_fn_direct<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    import: &relon_ir::ir::NativeImport,
) -> FunctionValue<'ctx> {
    module.get_function(&import.name).unwrap_or_else(|| {
        let lane = ctx.i64_type();
        // Every argument rides the i64 lane regardless of its IR type.
        let arg_slots: Vec<BasicMetadataTypeEnum<'ctx>> =
            vec![lane.into(); import.param_tys.len()];
        let signature = if matches!(import.ret_ty, IrType::Unit) {
            ctx.void_type().fn_type(&arg_slots, false)
        } else {
            lane.fn_type(&arg_slots, false)
        };
        module.add_function(&import.name, signature, Some(Linkage::External))
    })
}

/// #359 (W20): map an [`IrType`] to the LLVM type used in a helper /
/// lambda **call ABI** slot.
///
/// The ABI is integer-only:
///
/// * `I64` and `F64` use an `i64` slot. `F64` travels as its 64-bit
///   *bit pattern*, mirroring the operand-stack convention, so a
///   `CallClosure` / `Op::Call` site never has to bitcast between the
///   i64-bits stack representation and a native-float argument / return
///   slot — the value flows through verbatim. The W20 n-body helpers
///   (`pair_force` / `accel` return `F64`, `pair_force` takes an `F64`
///   mass) are the first closures with a Float in their signature;
///   a `double` slot could not be fed from the i64-bits operand stack.
/// * Every other type (`I32`, `Bool`, `Unit`, `String`, the list
///   types, `Closure`, `Dict`) uses an `i32` slot.
///
/// The match is exhaustive with no wildcard, so adding an `IrType`
/// variant forces a decision here. As written every variant maps to
/// `Some(..)`; the `Option` return is kept for callers that handle an
/// unmapped type.
fn ir_ty_to_llvm_abi<'ctx>(ctx: &'ctx Context, ty: IrType) -> Option<BasicTypeEnum<'ctx>> {
    let slot = match ty {
        // 64-bit lane: native ints and float bit patterns.
        IrType::I64 | IrType::F64 => ctx.i64_type(),
        // 32-bit lane: small scalars and every handle-like type.
        IrType::I32
        | IrType::Bool
        | IrType::Unit
        | IrType::String
        | IrType::ListInt
        | IrType::ListFloat
        | IrType::ListBool
        | IrType::ListString
        | IrType::ListSchema
        | IrType::ListList
        | IrType::Closure
        | IrType::Dict => ctx.i32_type(),
    };
    Some(slot.into())
}

fn basic_to_metadata(bt: BasicTypeEnum<'_>) -> BasicMetadataTypeEnum<'_> {
    match bt {
        BasicTypeEnum::IntType(t) => t.into(),
        BasicTypeEnum::FloatType(t) => t.into(),
        BasicTypeEnum::PointerType(t) => t.into(),
        BasicTypeEnum::ArrayType(t) => t.into(),
        BasicTypeEnum::StructType(t) => t.into(),
        BasicTypeEnum::VectorType(t) => t.into(),
        BasicTypeEnum::ScalableVectorType(t) => t.into(),
    }
}

/// Lower a sibling helper's body into its already-declared LLVM
/// `FunctionValue`.
///
/// Mirrors [`emit_legacy_entry`] but without enforcing the legacy-i64
/// envelope — helpers may carry any [`IrType`]-shaped param / return
/// mix that `ir_ty_to_llvm_abi` accepts.
///
/// The emitter is configured with no arena base and no state pointer,
/// `param_base = 0` (helpers have no implicit state slot, so
/// `LocalGet(0)` is LLVM param 0), a copy of `helper_table` for
/// resolving `Op::Call`, and the IR-declared return type so
/// `Op::Return` knows what to widen / truncate to when the operand
/// stack value's width differs from the LLVM return slot.
///
/// # Errors
///
/// Propagates any error from lowering `func.body`.
fn emit_helper_body<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    llvm_fn: FunctionValue<'ctx>,
    const_pool: &ConstPool,
    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
) -> Result<(), LlvmError> {
    // All code for the helper starts in a fresh `entry` block.
    let builder = ctx.create_builder();
    let first_block = ctx.append_basic_block(llvm_fn, "entry");
    builder.position_at_end(first_block);

    let mut emit = Emit::new(
        ctx,
        &builder,
        module,
        llvm_fn,
        EntryShape::LegacyI64,
        /*arena_base_ptr=*/ None,
        /*state_ptr=*/ None,
        /*buffer_return_size=*/ 0,
        const_pool,
    );

    // No implicit state slot: IR local `i` is LLVM param `i`.
    emit.param_base = 0;
    // Return-slot shape for `Op::Return`.
    emit.helper_ret_ty = Some(func.ret);
    // Sibling helpers reachable through `Op::Call`.
    emit.helper_table = Some(helper_table.clone());
    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));

    emit.lower_body(&func.body)?;
    Ok(())
}

/// Phase F.W7: emit a lambda body into its declared `llvm_fn`.
///
/// Mirrors [`emit_helper_body`] with these differences:
///
/// - LLVM param 0 (`*state`) is materialised into `state_ptr` and, via
///   a load at byte offset `ARENA_STATE_OFFSET_BASE`, into
///   `arena_base_ptr`, so the body's `Op::AllocScratch{,Dyn}` /
///   `Op::*AtAbsolute` ops resolve against the per-call arena state.
///   Lambdas need this because they read captures via
///   `LocalGet(0); LoadI32AtAbsolute { offset }` against the captures
///   struct in scratch.
/// - `param_base = 1`, so the IR's `LocalGet(idx)` skips the implicit
///   state slot — `LocalGet(0)` reads the captures_ptr at LLVM param 1,
///   matching what the IR lowering pass laid out in
///   `lower_closure_as_value`.
/// - `closure_fn_table` is threaded through so nested
///   `Op::MakeClosure` / `Op::CallClosure` ops inside the lambda body
///   resolve against the same module-wide lambda set the entry uses.
/// - `self_capture_offsets` / `known_capture_offsets` (the per-lambda
///   rows of the devirtualisation tables) and the raw captures_ptr
///   param are handed to the emitter.
///
/// # Errors
///
/// [`LlvmError::Codegen`] when `llvm_fn` lacks param 0 or param 1, or
/// when building the arena-base GEP / load / inttoptr fails; otherwise
/// any error from lowering `func.body`.
#[allow(clippy::too_many_arguments)]
fn emit_lambda_body<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    llvm_fn: FunctionValue<'ctx>,
    const_pool: &ConstPool,
    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
    closure_fn_table: &[FunctionValue<'ctx>],
    self_capture_offsets: &[(u32, u32)],
    known_capture_offsets: &[(u32, u32)],
) -> Result<(), LlvmError> {
    let builder = ctx.create_builder();
    let first_block = ctx.append_basic_block(llvm_fn, "entry");
    builder.position_at_end(first_block);

    // Materialise `state_ptr` + `arena_base_ptr` at function entry.
    // Same pointer-arithmetic shape the buffer entry uses — the lambda
    // shares the per-call `ArenaState` layout because the host (the
    // entry function or another lambda) passes its own state pointer
    // through to the call indirect site verbatim.
    let state_param = match llvm_fn.get_nth_param(0) {
        Some(param) => param.into_pointer_value(),
        None => {
            return Err(LlvmError::Codegen(format!(
                "lambda `{}` missing state param",
                func.name
            )));
        }
    };

    // Byte offset of the arena-base word inside the state struct; the
    // GEP below walks an `i8` view so the index is a byte count.
    let base_offset = ctx
        .i32_type()
        .const_int(ARENA_STATE_OFFSET_BASE as u64, false);
    // SAFETY (assumed — confirm against the `ArenaState` layout):
    // `state_param` is expected to point at a live per-call state
    // struct that extends past `ARENA_STATE_OFFSET_BASE` bytes, so the
    // in-bounds GEP stays inside that object.
    let arena_base_gep = unsafe {
        builder.build_in_bounds_gep(
            ctx.i8_type(),
            state_param,
            &[base_offset],
            "lambda_arena_base_gep",
        )
    }
    .map_err(|e| LlvmError::Codegen(format!("lambda arena_base GEP: {e}")))?;

    // TODO(P3-wasm32): use DataLayout pointer width instead of i64
    // for the arena-base word load + inttoptr below.
    let arena_base_word = builder
        .build_load(ctx.i64_type(), arena_base_gep, "lambda_arena_base")
        .map_err(|e| LlvmError::Codegen(format!("lambda arena_base load: {e}")))?;
    mark_invariant_load(ctx, arena_base_word);
    let arena_base_ptr = builder
        .build_int_to_ptr(
            arena_base_word.into_int_value(),
            ctx.ptr_type(AddressSpace::default()),
            "lambda_arena_base_ptr",
        )
        .map_err(|e| LlvmError::Codegen(format!("lambda arena_base inttoptr: {e}")))?;

    // Stash the captures_ptr LLVM param (param 1) so the self-recursion
    // fast path in `emit_call_closure` can reuse it directly instead
    // of round-tripping through a `captures_ptr` field load on every
    // recursion. The lambda signature pins this to LLVM param 1 (param
    // 0 is `*state`) — see `declare_lambda_function`.
    let captures_ptr_param = match llvm_fn.get_nth_param(1) {
        Some(param) => param.into_int_value(),
        None => {
            return Err(LlvmError::Codegen(format!(
                "lambda `{}` missing captures_ptr param",
                func.name
            )));
        }
    };

    let mut emit = Emit::new(
        ctx,
        &builder,
        module,
        llvm_fn,
        EntryShape::LegacyI64,
        /*arena_base_ptr=*/ Some(arena_base_ptr),
        /*state_ptr=*/ Some(state_param),
        /*buffer_return_size=*/ 0,
        const_pool,
    );

    // LLVM param 0 is `*state`; the IR's params (including the
    // implicit captures_ptr at IR index 0) start at LLVM param 1.
    emit.param_base = 1;
    emit.captures_ptr_param = Some(captures_ptr_param);

    // Module-wide lookup tables for `Op::Call` and closure ops.
    emit.helper_table = Some(helper_table.clone());
    emit.closure_fn_table = closure_fn_table.to_vec();

    // Devirtualisation facts about this lambda's captures struct.
    emit.self_capture_offsets = self_capture_offsets.to_vec();
    emit.known_capture_offsets = known_capture_offsets.to_vec();

    // The lambda body's `Op::Return` carries the IR-declared return
    // type so the dispatcher knows what LLVM `ret` shape to emit.
    emit.helper_ret_ty = Some(func.ret);
    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));

    emit.lower_body(&func.body)?;
    Ok(())
}

/// Phase D.1: emit a typed `(i64, i64, ...) -> i64` fast entry
/// alongside the buffer-protocol entry.
///
/// The IR body's op stream is reused, but the emitter runs in
/// fast-path mode: every buffer-protocol `LoadField` becomes a direct
/// LLVM param read (via `profile.arg_offsets`) and every trailing
/// `StoreField` at the return-value offset becomes a store into a
/// stack slot that is loaded and `ret`-urned at the end.
///
/// The function is added to `module` as `ENTRY_SYMBOL_FAST` with one
/// `i64` param per entry of `profile.arg_offsets`.
///
/// # Errors
///
/// * [`LlvmError::UnsupportedSignature`] when `func` is not
///   buffer-protocol shaped, or when the arity exceeds 8.
/// * [`LlvmError::Codegen`] when building the return slot or the
///   trailing load / `ret` fails.
/// * Any lowering error when the IR contains ops outside the fast-path
///   envelope (string ops, sandbox traps, pointer-indirect StoreField,
///   stdlib calls — anything that escapes the simple Int-arithmetic
///   loop). The evaluator side surfaces this as "fast path unavailable;
///   fall back to the buffer entry" rather than a hard error so adding
///   more workloads doesn't risk regressing the buffer path.
///
/// NOTE(review): the function is added to `module` before lowering
/// runs, so on a lowering error a partially-built `ENTRY_SYMBOL_FAST`
/// may remain in the module — confirm the caller discards or tolerates
/// it.
pub(crate) fn emit_fast_entry<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    profile: &FastPathProfile,
    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
    closure_fn_table: &[FunctionValue<'ctx>],
) -> Result<FunctionValue<'ctx>, LlvmError> {
    if !is_buffer_protocol_signature(&func.params, func.ret) {
        return Err(LlvmError::UnsupportedSignature(
            "fast-path entry requires buffer-protocol IR".into(),
        ));
    }

    // Cap at 8 to keep the typed dispatch table in evaluator.rs
    // finite. Sources with arity > 8 stay on the buffer path —
    // their boundary cost is amortised across more work anyway.
    let arity = profile.arg_offsets.len();
    if arity > 8 {
        return Err(LlvmError::UnsupportedSignature(format!(
            "fast-path entry: arity {arity} exceeds cap of 8"
        )));
    }

    // `(i64 x arity) -> i64`
    let word_t = ctx.i64_type();
    let param_types: Vec<BasicMetadataTypeEnum<'ctx>> = vec![word_t.into(); arity];
    let llvm_fn = module.add_function(ENTRY_SYMBOL_FAST, word_t.fn_type(&param_types, false), None);

    let builder = ctx.create_builder();
    let first_block = ctx.append_basic_block(llvm_fn, "fast_entry");
    builder.position_at_end(first_block);

    // Reserve an alloca for the return value. The fast emitter
    // rewrites the trailing `StoreField` / `StoreFieldAtRecord` at
    // the return slot (which under buffer protocol writes the i64
    // result into the arena) to a store into this slot; the implicit
    // `Op::Return` at end-of-body loads from the slot and `ret`s it.
    // Placing the alloca in the entry block lets LLVM's mem2reg
    // promote it to SSA across the loop boundary.
    let ret_slot = builder
        .build_alloca(word_t, "fast_ret_slot")
        .map_err(|e| LlvmError::Codegen(format!("fast ret_slot alloca: {e}")))?;
    // Zero-initialise so any early `Op::Return` (no value path) still
    // produces a defined value — matches the buffer entry's
    // "ret root_size when no scalar stored" envelope.
    builder
        .build_store(ret_slot, word_t.const_zero())
        .map_err(|e| LlvmError::Codegen(format!("fast ret_slot init: {e}")))?;

    // The fast entry is a typed `(i64...) -> i64` shape derived from
    // the buffer-protocol IR after the dispatch-boundary rewrite. It
    // doesn't touch the const-data pool (the IR only contains scalar
    // arithmetic ops) so it is handed an empty pool to keep
    // `Emit::new` polymorphic.
    let no_consts = ConstPool::default();
    let mut emit = Emit::new(
        ctx,
        &builder,
        module,
        llvm_fn,
        EntryShape::LegacyI64,
        /*arena_base_ptr=*/ None,
        /*state_ptr=*/ None,
        /*buffer_return_size=*/ 0,
        &no_consts,
    );
    emit.fast_path = Some(FastEmit {
        profile: profile.clone(),
        ret_slot,
    });
    // LLVM param i corresponds to arg i — no implicit state slot for
    // the fast entry. `LocalGet` should never appear in the body
    // because the IR producer only emits LocalGet for the handshake
    // params (which the fast path doesn't pass).
    emit.param_base = 0;
    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
    // Phase D.2: plumb the module-wide helper and closure tables so
    // an in-body `Op::Call` / `Op::MakeClosure` / `Op::CallClosure`
    // can resolve sibling functions. The fast emitter's per-op rewrites
    // (`MakeClosure` → virtualised closure, `CallClosure` → direct
    // call with null state/captures) consult these tables to pick the
    // matching `FunctionValue`.
    emit.helper_table = Some(helper_table.clone());
    emit.closure_fn_table = closure_fn_table.to_vec();
    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
    emit.lower_body(&func.body)?;

    // The buffer-protocol IR ends with `Op::Return` which the fast
    // emitter rewrote into a load+ret. If the body fell through
    // without an explicit Return (shouldn't happen for well-formed
    // `#main` IR, but be defensive), seal it with a load+ret.
    let needs_seal = builder
        .get_insert_block()
        .map_or(false, |block| block.get_terminator().is_none());
    if needs_seal {
        let result = builder
            .build_load(word_t, ret_slot, "fast_ret_load")
            .map_err(|e| LlvmError::Codegen(format!("fast trailing load: {e}")))?
            .into_int_value();
        builder
            .build_return(Some(&result))
            .map_err(|e| LlvmError::Codegen(format!("fast trailing ret: {e}")))?;
    }

    Ok(llvm_fn)
}

// ---------------------------------------------------------------------------
// Legacy-i64 entry (Phase A bootstrap envelope, retained for tests)
// ---------------------------------------------------------------------------

/// Emit the legacy-i64 entry with a helper table attached.
///
/// Thin adapter over [`emit_legacy_entry_impl`]: wraps `helper_table`
/// in `Some(..)` and forwards every other argument unchanged.
fn emit_legacy_entry_with_helpers<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
    imports: &[relon_ir::ir::NativeImport],
    world_mode: WorldMode,
) -> Result<FunctionValue<'ctx>, LlvmError> {
    let helpers = Some(helper_table);
    emit_legacy_entry_impl(ctx, module, func, helpers, imports, world_mode)
}

/// Emit a Phase-A `(I64...) -> I64` function under `ENTRY_SYMBOL`.
///
/// Used by tests + the Phase A bootstrap benchmarks that exercise the
/// hand-built IR fixtures directly (no buffer-protocol wrapping).
///
/// * `helper_table` — when `Some`, a copy is installed on the emitter
///   so `Op::Call` can resolve sibling helpers.
/// * `imports` / `world_mode` — always stored on the emitter. Under
///   `WorldMode::ClosedWorld` every import is additionally
///   pre-declared via [`declare_host_fn_direct`].
///
/// # Errors
///
/// [`LlvmError::UnsupportedSignature`] when any param or the return
/// type is not `IrType::I64`; otherwise any error from lowering
/// `func.body`.
fn emit_legacy_entry_impl<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    helper_table: Option<&HashMap<u32, FunctionValue<'ctx>>>,
    imports: &[relon_ir::ir::NativeImport],
    world_mode: WorldMode,
) -> Result<FunctionValue<'ctx>, LlvmError> {
    // Envelope check: the first non-I64 param (if any) is reported,
    // then the return type.
    let offending_param = func
        .params
        .iter()
        .enumerate()
        .find(|(_, ty)| **ty != IrType::I64);
    if let Some((i, p)) = offending_param {
        return Err(LlvmError::UnsupportedSignature(format!(
            "llvm-aot: legacy-i64 envelope expects I64 param at #{i}, got {p:?}"
        )));
    }
    if func.ret != IrType::I64 {
        return Err(LlvmError::UnsupportedSignature(format!(
            "llvm-aot: legacy-i64 envelope expects I64 return, got {:?}",
            func.ret
        )));
    }

    // `(i64 x N) -> i64`
    let word_t = ctx.i64_type();
    let param_types: Vec<BasicMetadataTypeEnum<'ctx>> = vec![word_t.into(); func.params.len()];
    let llvm_fn = module.add_function(ENTRY_SYMBOL, word_t.fn_type(&param_types, false), None);

    let builder = ctx.create_builder();
    let first_block = ctx.append_basic_block(llvm_fn, "entry");
    builder.position_at_end(first_block);

    // Legacy-i64 entry shape only consumes the hand-built fixtures
    // (helloworld_arith) which never reference ConstString — an empty
    // pool is enough.
    let no_consts = ConstPool::default();
    let mut emit = Emit::new(
        ctx,
        &builder,
        module,
        llvm_fn,
        EntryShape::LegacyI64,
        /*arena_base_ptr=*/ None,
        /*state_ptr=*/ None,
        /*buffer_return_size=*/ 0,
        &no_consts,
    );
    // Param order under the legacy envelope: every IR LocalGet(i)
    // maps to llvm_fn.param(i) — no implicit state slot.
    emit.param_base = 0;
    // Leave the emitter's own default in place when no table is given.
    if let Some(table) = helper_table {
        emit.helper_table = Some(table.clone());
    }
    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
    // Stage 1.B: closed-world legacy entry threads the `#native` import
    // table + pre-declares each host fn as an `extern` so `CallNative`
    // emits a direct `call @<host_symbol>` (no state pointer needed).
    // The open-world legacy path keeps `imports` empty (the legacy
    // fixtures never carry a `CallNative`).
    emit.imports = imports;
    emit.world_mode = world_mode;
    if matches!(world_mode, WorldMode::ClosedWorld) {
        imports.iter().for_each(|import| {
            declare_host_fn_direct(ctx, module, import);
        });
    }
    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
    emit.lower_body(&func.body)?;

    Ok(llvm_fn)
}

// ---------------------------------------------------------------------------
// Buffer-protocol entry (Phase B production envelope)
// ---------------------------------------------------------------------------

// Retained for symmetry with `emit_legacy_entry_with_helpers`; the
// Phase F.W7 emit path always routes through
// `emit_buffer_entry_with_helpers_and_closures` so a closure-free
// module still gets the new entry shape (with an empty closure
// table). Marked `#[allow(dead_code)]` to keep the symmetric pair
// visible without firing the unused-function lint.
#[allow(dead_code)]
fn emit_buffer_entry_with_helpers<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
) -> Result<FunctionValue<'ctx>, LlvmError> {
    emit_buffer_entry_impl(
        ctx,
        module,
        func,
        buffer_return_size,
        const_pool,
        Some(helper_table),
        &[],
        &[],
        WorldMode::OpenWorld,
        crate::CodegenTarget::Native,
        &[],
    )
}

/// Phase F.W7 variant of [`emit_buffer_entry_with_helpers`].
///
/// In addition to the helper table, the caller supplies the closure
/// function-pointer table (so the body's `Op::MakeClosure` lowering
/// can stamp the matching `fn_table_idx` into the closure handle), the
/// native import table, the world mode, the codegen target and the
/// per-import effectful flags. Everything is forwarded unchanged to
/// [`emit_buffer_entry_impl`]; the only adaptation is wrapping
/// `helper_table` in `Some(..)`.
#[allow(clippy::too_many_arguments)]
fn emit_buffer_entry_with_helpers_and_closures<'ctx, 'cp>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    buffer_return_size: u32,
    const_pool: &'cp ConstPool,
    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
    closure_fn_table: &[FunctionValue<'ctx>],
    imports: &'cp [relon_ir::ir::NativeImport],
    world_mode: WorldMode,
    target: crate::CodegenTarget,
    effectful_imports: &'cp [bool],
) -> Result<FunctionValue<'ctx>, LlvmError> {
    let helpers = Some(helper_table);
    emit_buffer_entry_impl(
        ctx,
        module,
        func,
        buffer_return_size,
        const_pool,
        helpers,
        closure_fn_table,
        imports,
        world_mode,
        target,
        effectful_imports,
    )
}

/// Emit the buffer-protocol entry function. The cranelift backend's
/// equivalent lives in `relon-codegen-cranelift::codegen::mod.rs` —
/// signature mirrored here so a host that holds either evaluator
/// can dispatch through the same `(state, in_ptr, …)` argv shape.
#[allow(clippy::too_many_arguments)]
fn emit_buffer_entry_impl<'ctx, 'cp>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    buffer_return_size: u32,
    const_pool: &'cp ConstPool,
    helper_table: Option<&HashMap<u32, FunctionValue<'ctx>>>,
    closure_fn_table: &[FunctionValue<'ctx>],
    imports: &'cp [relon_ir::ir::NativeImport],
    world_mode: WorldMode,
    target: crate::CodegenTarget,
    effectful_imports: &'cp [bool],
) -> Result<FunctionValue<'ctx>, LlvmError> {
    let i32_t = ctx.i32_type();
    let i64_t = ctx.i64_type();
    let ptr_t = ctx.ptr_type(AddressSpace::default());

    // (*state, i32 in_ptr, i32 in_len, i32 out_ptr, i32 out_cap, i64 caps) -> i32
    let param_types: Vec<BasicMetadataTypeEnum<'ctx>> = vec![
        ptr_t.into(),
        i32_t.into(),
        i32_t.into(),
        i32_t.into(),
        i32_t.into(),
        i64_t.into(),
    ];
    let fn_type = i32_t.fn_type(&param_types, false);
    let llvm_fn = module.add_function(ENTRY_SYMBOL, fn_type, None);

    let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
    let builder = ctx.create_builder();
    builder.position_at_end(entry_bb);

    // Resolve the per-call arena base once at function entry. The
    // LoadField / StoreField helpers consume this cached value so
    // the JIT doesn't reload `state->arena_base` on every access.
    let state_param = llvm_fn
        .get_nth_param(0)
        .ok_or_else(|| LlvmError::Codegen("buffer entry missing state param".into()))?
        .into_pointer_value();

    // Pointer arithmetic on the state struct: GEP by ARENA_STATE_OFFSET_BASE
    // bytes through an i8 view, then load the `usize` arena base.
    // We use opaque pointers so the GEP element type only matters
    // for the offset calculation.
    let i8_t = ctx.i8_type();
    let arena_base_gep = unsafe {
        builder
            .build_in_bounds_gep(
                i8_t,
                state_param,
                &[i32_t.const_int(ARENA_STATE_OFFSET_BASE as u64, false)],
                "arena_base_gep",
            )
            .map_err(|e| LlvmError::Codegen(format!("arena_base GEP: {e}")))?
    };
    // `arena_base` is `usize`. On every supported host that's i64
    // (we only target x86_64 today; the inkwell feature set in the
    // Cargo.toml is `target-x86`). If we add a 32-bit host the
    // load type needs to follow `pointer_type` width — Phase B
    // assumes the workspace's only target is 64-bit.
    // TODO(P3-wasm32): use DataLayout pointer width instead of i64
    // for the arena-base word load + inttoptr below.
    let arena_base_load = builder
        .build_load(i64_t, arena_base_gep, "arena_base")
        .map_err(|e| LlvmError::Codegen(format!("arena_base load: {e}")))?;
    mark_invariant_load(ctx, arena_base_load);
    let arena_base_int = arena_base_load.into_int_value();
    let arena_base_ptr = builder
        .build_int_to_ptr(arena_base_int, ptr_t, "arena_base_ptr")
        .map_err(|e| LlvmError::Codegen(format!("arena_base inttoptr: {e}")))?;

    // Phase E.1 prologue: init `state.tail_cursor = buffer_return_size`
    // so the first pointer-indirect StoreField lands past the fixed
    // area. Cheap (one store per call) — keeping it unconditional
    // avoids a body pre-scan. Bodies that never touch the tail
    // cursor pay the dead store; mem2reg / DSE eliminate it at -O3.
    let tail_init_gep = unsafe {
        builder
            .build_in_bounds_gep(
                i8_t,
                state_param,
                &[i32_t.const_int(u64::from(ARENA_STATE_OFFSET_TAIL_CURSOR), false)],
                "tail_cursor_init_gep",
            )
            .map_err(|e| LlvmError::Codegen(format!("tail_cursor init GEP: {e}")))?
    };
    let tail_init = i32_t.const_int(u64::from(buffer_return_size), false);
    builder
        .build_store(tail_init_gep, tail_init)
        .map_err(|e| LlvmError::Codegen(format!("tail_cursor init store: {e}")))?;

    let mut emit = Emit::new(
        ctx,
        &builder,
        module,
        llvm_fn,
        EntryShape::Buffer,
        Some(arena_base_ptr),
        Some(state_param),
        buffer_return_size,
        const_pool,
    );
    // Buffer-protocol LocalGet(0..=3) reads the four i32 handshake
    // slots; LocalGet(4) reads the i64 `caps` slot. The state
    // pointer occupies slot 0 in the LLVM function — IR locals
    // start at +1 from there.
    emit.param_base = 1;
    if let Some(table) = helper_table {
        emit.helper_table = Some(table.clone());
    }
    emit.closure_fn_table = closure_fn_table.to_vec();
    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
    // Phase 0b: thread the `#native` import table through so
    // `Op::CallNative` can validate the call shape.
    emit.imports = imports;
    emit.world_mode = world_mode;
    emit.target = target;
    emit.effectful_imports = effectful_imports;
    match world_mode {
        // Open-world (MCJIT / from_source): declare the dynamic-dispatch
        // helper so `Op::CallNative` emits a `call @relon_llvm_call_native`
        // that `add_global_mapping` later resolves to the host address.
        //
        // P3 §2.2: the wasm32 target has no MCJIT engine to patch the
        // helper symbol in — declaring it would leave an unresolvable
        // native import. The wasm path instead lowers each
        // `Op::CallNative` to a direct **wasm import** call
        // (`emit_call_native_wasi`), declaring the import lazily at the
        // call site, so we skip the helper declaration here.
        WorldMode::OpenWorld if matches!(target, crate::CodegenTarget::Wasm32) => {
            emit.call_native_fn = None;
        }
        WorldMode::OpenWorld => {
            emit.call_native_fn = Some(declare_call_native(ctx, module));
        }
        // Closed-world (Stage 1.B LTO co-compile): pre-declare every
        // host fn as an `extern` so `Op::CallNative` can emit a direct
        // `call @<host_symbol>`. The host bitcode is linked + inlined by
        // `crate::cocompile`. No `relon_llvm_call_native` helper exists
        // on this path.
        //
        // P3 §2.2 wasm closed-world: only pre-declare the **pure-compute**
        // host fns as direct externs (those get co-compiled + inlined).
        // An **effectful** host fn must NOT be inlined into the sandbox —
        // its `Op::CallNative` routes to `emit_call_native_wasi`, which
        // declares the `(import "env" …)` lazily. Pre-declaring it here as
        // a plain extern would still be link-resolved by the inlined-shim,
        // defeating the boundary, so we skip effectful imports.
        WorldMode::ClosedWorld => {
            emit.call_native_fn = None;
            for (idx, import) in imports.iter().enumerate() {
                let effectful = effectful_imports.get(idx).copied().unwrap_or(false);
                if !effectful {
                    declare_host_fn_direct(ctx, module, import);
                }
            }
        }
    }
    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
    emit.emit_step_budget_check("entry")?;
    emit.lower_body(&func.body)?;

    Ok(llvm_fn)
}

// ---------------------------------------------------------------------------
// Per-function emitter state
// ---------------------------------------------------------------------------

/// Per-function emitter state. Holds the inkwell builder borrow,
/// the LLVM function the emit targets, the IR's operand stack, and
/// the alloca slots backing `LetSet` / `LetGet`.
///
/// `param_base` accounts for the entry-shape's implicit param slot:
/// the buffer-protocol entry has the `*state` pointer at LLVM param
/// 0, so `LocalGet(0)` resolves to LLVM param 1. The legacy-i64
/// entry has no implicit slot, so `param_base = 0`.
///
/// Lifetimes: `'ctx` is the LLVM context, `'b` the borrow of the
/// builder and module for the duration of the emit pass, and `'cp` the
/// borrow of the const pool and the native-import tables.
pub(crate) struct Emit<'ctx, 'b, 'cp> {
    /// LLVM context the emitted types and values belong to.
    pub(crate) ctx: &'ctx Context,
    /// Instruction builder borrowed for the emit pass.
    pub(crate) builder: &'b Builder<'ctx>,
    /// LLVM function this emitter is lowering into.
    pub(crate) func: FunctionValue<'ctx>,
    /// Phase F.1: cached module reference so per-op lowering can
    /// declare extern symbols (the F.1 `str.contains` host shim) on
    /// demand without threading the module through every helper. The
    /// reference is borrowed for the emit pass only; `inkwell` keeps
    /// `Module` and `FunctionValue` lifetimes orthogonal so a borrow
    /// here doesn't conflict with the surrounding `add_function`
    /// calls in the entry/helper emit paths.
    pub(crate) module: &'b LlvmModule<'ctx>,
    /// Entry shape being emitted (e.g. [`EntryShape::Buffer`] for the
    /// buffer-protocol entry).
    pub(crate) shape: EntryShape,
    /// Cached `arena_base` pointer for the buffer-protocol entry.
    /// `None` for the legacy entry shape — `LoadField` / `StoreField`
    /// reject themselves before reaching for this value.
    pub(crate) arena_base_ptr: Option<PointerValue<'ctx>>,
    /// Cached state-pointer LLVM value (param 0 of the buffer entry).
    /// Phase E.1 uses it to load / store the per-call tail-cursor /
    /// scratch-cursor / scratch-base slots. `None` outside the
    /// buffer-protocol entry shape.
    pub(crate) state_ptr: Option<PointerValue<'ctx>>,
    /// Operand stack mirroring the IR's virtual stack. Every value
    /// in flight is an LLVM integer of the matching IR type. Each
    /// [`TypedValue`] tags the value with its IR type (plus a
    /// provenance hint) so consumers can pick the right
    /// signed / unsigned predicate without re-deriving it.
    pub(crate) stack: Vec<TypedValue<'ctx>>,
    /// `LetSet { idx }` alloca slots, keyed by let index `idx`; each
    /// entry holds the slot pointer together with the IR type stored
    /// in it. An index has at most one type at a time — the IR
    /// lowering pass guarantees no aliasing between indices of
    /// different types.
    pub(crate) let_slots: std::collections::HashMap<u32, (PointerValue<'ctx>, IrType)>,
    /// Static let-index floor for stdlib inline-frame windows: the
    /// function body's [`relon_ir::ir::body_let_watermark`], i.e. one
    /// past the highest let index the body (recursively) touches.
    /// `emit_call_stdlib` places each inline window at
    /// `max(declared-slots max + 1, let_floor)` so callee lets never
    /// collide with caller lets that are first bound *after* the
    /// inlined call. While a frame is active the floor is raised past
    /// the callee body's own watermark (and restored on frame pop) so
    /// nested inlines stay collision-free too.
    pub(crate) let_floor: u32,
    /// LLVM param offset corresponding to `LocalGet(0)`. See
    /// [`Self::lookup_param`] — `param_base + idx` is the LLVM
    /// param index.
    pub(crate) param_base: u32,
    /// Label stack carrying the (entry_bb, exit_bb, kind) of every
    /// nested [`Op::Block`] / [`Op::Loop`]. `Br { label_depth }`
    /// indexes from the back (depth 0 = innermost). `Block`s exit
    /// to their tail; `Loop`s exit to their head.
    pub(crate) label_stack: Vec<LabelFrame<'ctx>>,
    /// Monotonic counter to mint unique LLVM basic block / value
    /// names so the dumped IR is human-readable.
    pub(crate) name_seq: u32,
    /// Phase B: hard-coded `return_root_size` returned from a
    /// buffer-protocol `Op::Return`. The IR producer leaves no
    /// value on the operand stack for `Return` under buffer
    /// protocol — the trampoline reads back `bytes_written` to
    /// decode the output record. We hard-code this to the schema's
    /// `return_layout.root_size`, passed in at emit time.
    pub(crate) buffer_return_size: u32,
    /// Phase D.1: set when emitting the fast-path entry. The
    /// `Op::LoadField` / `Op::StoreField` / `Op::Return` lowering
    /// branches consult this to rewrite the buffer-protocol IR
    /// against the typed `(i64...) -> i64` LLVM signature.
    pub(crate) fast_path: Option<FastEmit<'ctx>>,
    /// Phase E.2 multi-function lookup: when populated, `Op::Call`
    /// with `fn_index >= stdlib_function_count()` resolves to the
    /// matching sibling `FunctionValue` and emits a direct LLVM
    /// `call`. The map is keyed by IR-side `funcs` index (i.e.
    /// `fn_index - stdlib_count`). Empty for hand-built fixtures that
    /// never reference user-defined functions.
    pub(crate) helper_table: Option<HashMap<u32, FunctionValue<'ctx>>>,
    /// Phase E.2: when emitting a helper body (not the entry), this
    /// carries the IR-declared return type so `Op::Return` can pick
    /// the right LLVM `ret` shape. `None` while lowering the entry
    /// body — the entry's return shape is dictated by `EntryShape`.
    pub(crate) helper_ret_ty: Option<IrType>,
    /// Phase E.2: cached `llvm.trap` intrinsic `FunctionValue`. The
    /// intrinsic is declared once per module (in
    /// [`emit_module_funcs`]); each `Emit` snapshots the pointer so
    /// per-op `Div(I64)` / `Mod(I64)` guards can call it without
    /// re-querying the module.
    pub(crate) llvm_trap_fn: Option<FunctionValue<'ctx>>,
    /// Phase E.1: per-module const-data lookup. `Op::ConstString { idx }`
    /// reads the matching offset and pushes `iconst(I32, off)`.
    pub(crate) const_pool: &'cp ConstPool,
    /// Phase E.1: stack of inline call frames. `Op::Call` pushes one
    /// before lowering the callee body; `Op::Return` inside the
    /// callee body pops the typed value into the topmost frame's
    /// result alloca and jumps to its exit block. The callee's
    /// `LocalGet(idx)` resolves to `params[idx]` rather than the
    /// entry's LLVM params; `LetGet/LetSet` indices are remapped
    /// against `let_offset` so concurrent inline frames don't clash.
    pub(crate) inline_frames: Vec<InlineFrame<'ctx>>,
    /// Phase E.1: did the body emit a pointer-indirect StoreField?
    /// When set, the buffer-protocol epilogue returns the post-bump
    /// tail cursor (in bytes past `out_ptr`) rather than the
    /// statically-known `buffer_return_size`. Mirrors cranelift's
    /// `needs_tail_cursor` flag.
    pub(crate) needs_tail_cursor: bool,
    /// In-place region-walk return ABI (S2): set by `emit_store_field`
    /// when the entry returns a `List<List<scalar>>` sourced directly
    /// from a `#main` parameter. Holds the **arena-relative** i32 offset
    /// of the root list header (the value `Op::LoadListListPtr` pushed,
    /// already rebased by `in_ptr`). No bytes are copied into `out_buf`;
    /// instead the buffer epilogue (`emit_return`) encodes this offset as
    /// the negative in-place sentinel `-(root_abs + 1)` and returns it,
    /// telling the host to verify + decode the value in place at its
    /// source region rather than at `out_ptr`. `None` for every other
    /// return shape, which keeps the existing `buffer_return_size` /
    /// tail-cursor epilogue. Mirrors cranelift's `inplace_return_root`.
    pub(crate) inplace_return_root: Option<IntValue<'ctx>>,
    /// Phase F.W7: ordered list of lambda `FunctionValue`s, indexed by
    /// `fn_table_idx`. `Op::MakeClosure { fn_table_idx }` stamps the
    /// matching index into the closure handle's `fn_table_idx` slot
    /// and uses the same lookup to resolve the function pointer to
    /// stash. `Op::CallClosure` reads the handle's `fn_table_idx`
    /// slot and dispatches indirectly through a private global table
    /// of function pointers seeded from this list. Empty when the
    /// module contains no lambdas.
    pub(crate) closure_fn_table: Vec<FunctionValue<'ctx>>,
    /// Phase F.W7: per-IR-`record_local_idx` allocas backing
    /// `Op::AllocRootRecord` / `Op::StoreFieldAtRecord`. The slot
    /// holds an i32 out_ptr-relative offset; `AllocRootRecord` writes
    /// `0` there (root sits at `out_ptr + 0`), `StoreFieldAtRecord`
    /// reads it back to compute the destination address. Mirrors
    /// cranelift's `record_locals` map.
    pub(crate) record_locals: std::collections::HashMap<u32, PointerValue<'ctx>>,
    /// Phase H: bytes literal pushed by the *immediately preceding*
    /// `Op::ConstString` op (i.e. still the top-of-stack at the start
    /// of the next `lower_op` call). Cleared at the start of every
    /// `lower_op` and re-populated by the `Op::ConstString` arm at
    /// its tail. The `Op::Call` arm reads this when `fn_index ==
    /// STDLIB_IDX_CONTAINS` to detect the const-needle case and
    /// inline a tight byte-scan loop, skipping the
    /// `relon_llvm_str_contains_arena` extern shim's FFI boundary
    /// (~10-15 cycles of prologue/epilogue per call on x86_64). On
    /// the W4 / W4_long hot loops the needle is always a
    /// compile-time const (`"x"`), so the const-needle fast path
    /// fires 100% of iters. Stays `None` when the needle came in via
    /// `LocalGet` / `LetGet` / any non-`ConstString` producer — those
    /// fall through to the existing extern path.
    pub(crate) last_const_string: Option<Vec<u8>>,
    /// Phase F.W7 self-recursion fast path: per-lambda map of captures
    /// struct offsets that hold a self-recursive closure handle, keyed
    /// by the `fn_table_idx` of the enclosing lambda. Populated only
    /// for lambda bodies (the entry / helpers leave it empty); the
    /// scanner in `build_self_capture_table` correlates each
    /// `Op::MakeClosure` in the entry with the immediately following
    /// `LetSet { idx, ty: Closure }` to identify captures whose
    /// `cap.let_idx == idx` (i.e. the binding being assigned right
    /// after MakeClosure — the canonical IR shape for a self-recursive
    /// closure-as-value let). The value `Vec<(offset,
    /// self_fn_table_idx)>` lets the lambda-prologue `Op::LocalGet(0);
    /// Op::LoadI32AtAbsolute { offset }` chain stamp the matching
    /// [`Provenance::OwnCaptureHandle`] on the produced handle so the
    /// downstream `Op::CallClosure` can pick the direct-call fast path
    /// (skip handle deref, skip switch, reuse the lambda's own
    /// captures_ptr LLVM param 1). Empty when the lambda has no
    /// self-recursive captures or when self-recursion detection is
    /// unavailable (legacy / fixture entries that bypass the
    /// MakeClosure → LetSet pattern).
    pub(crate) self_capture_offsets: Vec<(u32, u32)>,
    /// Phase F.W7 self-recursion fast path: let-slot indices that hold
    /// a self-recursive closure handle along with the enclosing
    /// lambda's `fn_table_idx`. Populated by `Op::LetSet` when the
    /// stored value carries [`Provenance::OwnCaptureHandle`] so the
    /// matching `Op::LetGet` can re-emit the provenance — this is what
    /// lets the recursive `fib(k - 1)` call site (which always goes
    /// through `LetGet`) keep the self-recursion fast path intact.
    pub(crate) self_capture_let_slots: std::collections::HashMap<u32, (u32, u32)>,
    /// Phase F.W7 self-recursion fast path: captures_ptr LLVM param
    /// (param 1) of the enclosing lambda. Cached so the closure-call
    /// emitter can pass it straight into the recursive call without
    /// re-loading from the closure handle. `None` when emitting the
    /// entry / a helper (not a lambda body) — the self-recursion fast
    /// path is gated on this being `Some`.
    pub(crate) captures_ptr_param: Option<IntValue<'ctx>>,
    /// Phase D.2 fast-path entry: let-slot indices holding a
    /// virtualised closure stamped by an in-body `Op::MakeClosure`
    /// (carries `Provenance::FastPathClosure`). The `LetSet` that
    /// catches such a value stashes the `fn_table_idx` here so the
    /// matching `LetGet` can re-emit the provenance, keeping the
    /// `CallClosure` direct-call rewrite alive across the let chain.
    /// Empty when not emitting the fast-path entry.
    pub(crate) fast_path_closure_let_slots: std::collections::HashMap<u32, u32>,
    /// Phase L W3: let-slot indices holding a `Provenance::ConstString`
    /// value (i.e. the let was set from a value sourced — directly or
    /// via prior `LetGet` chains — from an `Op::ConstString`). The
    /// matching `LetGet` re-stamps the provenance so the downstream
    /// `Op::Add(String)` lowering can switch to the const-len /
    /// single-byte-store fast path. Each entry records (len, optional
    /// first_byte). Empty by default; entries survive only across
    /// inner-loop iterations because the W3 reduce shape's `s` let is
    /// re-set every iteration from the same const literal.
    pub(crate) const_string_let_slots: std::collections::HashMap<u32, (u32, Option<u8>)>,
    /// Devirtualisation (W18): let-slot indices holding a real
    /// arena-resident closure handle whose `fn_table_idx` is a
    /// compile-time constant (`Provenance::KnownClosure`). The `LetSet`
    /// that catches such a value stashes the `fn_table_idx` here so the
    /// matching `LetGet` re-stamps the provenance, letting the downstream
    /// `CallClosure` emit a direct call (LLVM inlines it) instead of the
    /// runtime `switch i32 %cc_fn_idx`. A non-known-closure `LetSet`
    /// against the same slot wipes the entry so a later `LetGet` cannot
    /// fraudulently claim a static target. Empty by default.
    pub(crate) known_closure_let_slots: std::collections::HashMap<u32, u32>,
    /// Devirtualisation (W18): `(capture_offset, captured_fn_table_idx)`
    /// pairs for the lambda body currently being emitted, identifying
    /// captures-struct offsets that hold a handle produced by a literal
    /// `MakeClosure` with a compile-time-constant `fn_table_idx` (a
    /// *known* closure that is NOT a self-capture). The capture-load
    /// prologue (`LocalGet(0); LoadI32AtAbsolute { offset }`) stamps
    /// [`Provenance::KnownClosure`] on the matching load so a body
    /// `CallClosure` against the capture emits a direct call. Seeded by
    /// [`build_known_capture_table`]; empty when emitting the entry /
    /// helpers or a lambda with no such captures.
    pub(crate) known_capture_offsets: Vec<(u32, u32)>,
    /// Phase 0b native dispatch: the module's `#native` imports, in
    /// `import_idx` order. `Op::CallNative` validates the call's
    /// `import_idx` / param-shape / ret-ty against this table before
    /// emitting the dispatch (mirrors cranelift's `self.ir.imports`
    /// check). Empty for hand-built fixtures / fast / helper / lambda
    /// emits — those never carry a `CallNative`, so the validation arm
    /// surfaces a precise `Codegen` error if one slips through.
    pub(crate) imports: &'cp [relon_ir::ir::NativeImport],
    /// Phase 0b native dispatch: the declared `relon_llvm_call_native`
    /// helper `FunctionValue`. `Op::CallNative` emits a `call` against
    /// it. `None` outside the buffer-protocol entry (the only shape
    /// that carries a `*state` pointer to thread through).
    pub(crate) call_native_fn: Option<FunctionValue<'ctx>>,
    /// Stage 1.B: open-world (dynamic helper) vs closed-world (static
    /// direct `call @<host_symbol>`) native dispatch. Defaults to
    /// [`WorldMode::OpenWorld`] so MCJIT / `from_source` are untouched;
    /// only `crate::cocompile` flips it to `ClosedWorld`.
    pub(crate) world_mode: WorldMode,
    /// P3 §2.2: the codegen target. Defaults to
    /// [`CodegenTarget::Native`]; only the wasm32 object-emit path flips
    /// it to [`CodegenTarget::Wasm32`]. On wasm32 an open-world
    /// `Op::CallNative` lowers to a **wasm import** call (see
    /// [`crate::wasi_host`]) instead of the native MCJIT
    /// `relon_llvm_call_native` helper, which the sandbox cannot reach.
    pub(crate) target: crate::CodegenTarget,
    /// P3 §2.2 wasm closed-world routing: per-`import_idx` effectful flag.
    /// `effectful_imports[i] == true` means the host fn at import index
    /// `i` is capability-gated (a preceding `Op::CheckCap` guards its
    /// call) — an *effectful* fn that must cross the sandbox boundary as a
    /// **WASI import**, not be inlined into the wasm unit. `false` (or an
    /// out-of-range index on the legacy / native paths) means pure-compute:
    /// co-compile + inline. Empty slice on every path except wasm32
    /// closed-world; the wasm closed-world emit
    /// (`emit_module_funcs_closed_world_wasm`) populates it from the IR's
    /// CheckCap → CallNative shape.
    pub(crate) effectful_imports: &'cp [bool],
}

/// Phase E.1: per-call inline-frame state. One entry per active
/// stdlib `Op::Call`; the callee body lowers against the topmost
/// frame.
pub(crate) struct InlineFrame<'ctx> {
    /// LLVM values bound to the callee's `LocalGet(0..arity)` reads.
    /// Order matches the IR's declared parameter order — the
    /// `Op::Call` site popped them from the caller's operand stack
    /// (top-of-stack = last param) and reversed.
    pub(crate) params: Vec<TypedValue<'ctx>>,
    /// Offset added to the callee's `LetGet/LetSet` indices so its
    /// let-bindings don't alias the caller's slots. Mirrors the
    /// cranelift backend's `let_offset`.
    pub(crate) let_offset: u32,
    /// Result alloca. The callee's `Op::Return` stores the popped
    /// value into this slot and unconditionally branches to
    /// [`Self::exit_bb`]; the caller continues from there with a
    /// matching load.
    pub(crate) ret_slot: PointerValue<'ctx>,
    /// IR type of the value stored at [`Self::ret_slot`]. Taken from
    /// the IR-declared `ret_ty` of the stdlib call so the caller-side
    /// load knows what width to read.
    pub(crate) ret_ty: IrType,
    /// Branch target for `Op::Return` inside the callee body. The
    /// caller positions the builder here after the inline finishes
    /// and pushes the loaded return value back onto the operand
    /// stack.
    pub(crate) exit_bb: inkwell::basic_block::BasicBlock<'ctx>,
}

/// Phase D.1 fast-path emission state. Carried inside [`Emit`] (as
/// `Emit::fast_path`) when lowering the typed fast entry.
#[derive(Clone)]
pub(crate) struct FastEmit<'ctx> {
    /// Fast-path profile for the entry being emitted. Its `ret_offset`
    /// identifies the `StoreField` that is redirected into
    /// [`Self::ret_slot`].
    pub(crate) profile: FastPathProfile,
    /// Alloca holding the i64 return value. Trailing `StoreField`
    /// at `profile.ret_offset` writes into this slot; `Op::Return`
    /// loads from it.
    pub(crate) ret_slot: PointerValue<'ctx>,
}

/// One operand-stack entry: an LLVM integer value tagged with its IR
/// type and a [`Provenance`] hint. `Copy`, so entries can be pushed,
/// popped and stashed in inline frames by value.
#[derive(Clone, Copy)]
pub(crate) struct TypedValue<'ctx> {
    /// The LLVM integer value itself.
    pub(crate) val: IntValue<'ctx>,
    /// IR-level tag of `val`. Recorded so Phase C predicates that
    /// inspect operand types (signed-vs-unsigned cmp, F64 routing)
    /// have it on hand without re-deriving from LLVM bit width.
    /// Phase B never consumes this field; `#[allow(dead_code)]`
    /// keeps the lint clean while we're still wiring future Op
    /// support.
    #[allow(dead_code)]
    pub(crate) ty: IrType,
    /// Provenance hint used by [`Emit::emit_call_closure`] to detect
    /// self-recursive closure calls. Defaults to [`Provenance::None`]
    /// for every push that doesn't go through the lambda-prologue
    /// capture path; the closure-self-call fast path only fires when
    /// the consumed handle's provenance points at one of the lambda's
    /// own self-capture offsets.
    pub(crate) prov: Provenance,
}

/// Tracks where an [`IntValue`] on the operand stack came from so the
/// closure-call emitter can detect self-recursion without re-loading
/// the handle's captures pointer through arena indirection.
///
/// The W7 production source's `fib` closure captures itself, so every
/// recursive `fib(k - 1)` call site walks
/// `captures_ptr -> self_handle -> captures_ptr_field -> direct call`.
/// LLVM cannot fold the `captures_ptr_field` load back to the input
/// `captures_ptr` because the chain crosses `MakeClosure` in another
/// function (no IPA reach), so a pure post-O3 IR ends up with three
/// arena loads per recursion (`~10 ns/call ≈ +170 µs` over `fib(22)`).
///
/// The provenance bits below are enough to short-circuit:
///
/// * `OwnCapturesPtr` — the value is the lambda's own captures_ptr arg
///   (LLVM param 1). Produced by `Op::LocalGet(0)` inside a lambda.
/// * `OwnCaptureHandle { offset, self_fn_table_idx }` — the value is a
///   closure handle loaded from `captures_ptr + offset` and the
///   matching `MakeClosure` capture is self-recursive (handle points
///   back at the enclosing lambda whose `fn_table_idx ==
///   self_fn_table_idx`). Lets `Op::CallClosure` emit a direct call to
///   `closure_fn_table[self_fn_table_idx]` with the current
///   `captures_ptr` arg — no handle deref, no switch, no trap branch.
///
/// The remaining variants (`FastPathClosure`, `KnownClosure`,
/// `ConstString`) carry other compile-time facts about a stack value;
/// each one is documented on the variant itself.
///
/// The type is `Copy`, so a tag can be duplicated freely when a value
/// is stored to and reloaded from a let-slot.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum Provenance {
    /// Nothing is known about the value's origin. This is the tag a
    /// plain `push` attaches.
    None,
    /// LLVM param 1 of the enclosing lambda — the captures_ptr arg.
    OwnCapturesPtr,
    /// Closure handle loaded from `captures_ptr + offset`; the matching
    /// MakeClosure capture is self-recursive, so the handle's
    /// `captures_ptr` field equals `OwnCapturesPtr` and the handle's
    /// `fn_table_idx` equals `self_fn_table_idx`.
    OwnCaptureHandle {
        /// Offset from `captures_ptr` the handle was loaded from.
        /// NOTE(review): the lint allowance suggests nothing reads this
        /// field directly yet — confirm before removing it.
        #[allow(dead_code)]
        offset: u32,
        /// `fn_table_idx` of the enclosing lambda, i.e. the direct-call
        /// target for a self-recursive `CallClosure`.
        self_fn_table_idx: u32,
    },
    /// Phase D.2: closure handle materialised by a `MakeClosure` op
    /// inside the fast-path entry. The fast entry has no arena/state,
    /// so `MakeClosure` cannot bump-allocate the 8-byte handle record;
    /// instead the value is virtualised — we remember the
    /// `fn_table_idx` and rewrite the matching `CallClosure` into a
    /// direct call against the lambda function. The lambda's
    /// `(state, captures_ptr, args...)` signature is satisfied by
    /// passing null / zero for state / captures, which is sound for
    /// W7-style self-recursive closures whose post-O3 body drops
    /// both args.
    FastPathClosure {
        /// Function-table index of the lambda the virtual handle
        /// stands for.
        fn_table_idx: u32,
    },
    /// Devirtualisation (W18, 2026-05-30): the IntValue is a *real*
    /// arena-resident closure handle (`[fn_table_idx][captures_ptr]`)
    /// produced by a literal [`Op::MakeClosure`] whose `fn_table_idx` is
    /// a compile-time constant. Unlike [`Self::FastPathClosure`] the
    /// handle is fully materialised in the arena (the buffer-protocol
    /// entry has state + arena), so the matching `CallClosure` still
    /// loads the real `captures_ptr` from `handle + 4` — it only skips
    /// the runtime `switch i32 %cc_fn_idx` over `handle + 0`, because the
    /// handle's `fn_table_idx` word is *provably* this constant.
    ///
    /// Soundness: the value flows unmodified from the `MakeClosure` (or a
    /// `LetSet`/`LetGet` round-trip, or an inline-frame argument bind)
    /// to the `CallClosure`; there is exactly one possible callee, so the
    /// switch's runtime selection is statically decided. The slow-path
    /// `build_switch` stays for any handle that did *not* arrive with
    /// this provenance (a genuinely-dynamic dispatch). When the W18
    /// `_list_filter` predicate (a literal `(k) => is_prime(k, 2)`
    /// MakeClosure) is inlined into the bundled `list_int_filter` body,
    /// this lets the per-element predicate dispatch become a direct call
    /// LLVM then inlines, killing the hot-loop switch.
    KnownClosure {
        /// The compile-time-constant function-table index stored in
        /// the handle's first word.
        fn_table_idx: u32,
    },
    /// Phase L W3 (2026-05-28): the IntValue is an i32 arena offset to a
    /// `[len:u32 LE][payload]` String record whose payload was placed in
    /// the const-pool prefix at module build time, so its length is
    /// known at compile time. Carried by `Op::ConstString` and
    /// propagated through `Op::LetSet { ty: String }` →
    /// `Op::LetGet { ty: String }` so `Op::Add(String)` can feed the
    /// const length to LLVM (memcpy intrinsic with const size lowers
    /// to inline stores) and skip the per-iter `[len]` header reload.
    ///
    /// Single-byte payloads (the W3 reduce hot loop's `"a"`) further
    /// expose `first_byte` so the in-place fast path can emit a single
    /// `i8 store` instead of `memcpy` — bypassing the LLVM lowering
    /// pass altogether for the dominant reduce shape.
    ConstString {
        /// Compile-time-known payload length (the value of the
        /// record's `[len]` header).
        len: u32,
        /// `Some(byte)` when `len == 1` so the lowering can emit an
        /// inline `store i8 byte, dst` instead of a memcpy intrinsic.
        /// `None` for longer payloads (LLVM's memcpy intrinsic
        /// lowering still handles those well once the size is const).
        first_byte: Option<u8>,
    },
}

/// Which structured-control-flow construct a label belongs to. The
/// kind decides the direction a `Br` targeting the label takes:
/// forward out of a block, or backward to a loop header.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum LabelKind {
    /// `Br` jumps **past** the block (forward exit).
    Block,
    /// `Br` jumps **back** to the loop header (continue).
    Loop,
}

/// Basic-block bookkeeping for one label (a block or a loop): the two
/// LLVM basic blocks a `Br` may need, plus the [`LabelKind`] that
/// selects between them.
///
/// The frame is `Copy`; it only holds basic-block handles and a tag.
#[derive(Clone, Copy)]
pub(crate) struct LabelFrame<'ctx> {
    /// Header basic block. For Block this is unused for branching
    /// (we never branch backward to the start of a block); for Loop
    /// it's the target of a `Br` (continue).
    pub(crate) header_bb: inkwell::basic_block::BasicBlock<'ctx>,
    /// Tail basic block — what code after the block / after the
    /// loop falls through to. For Block this is the `Br` target;
    /// for Loop the surrounding code lives here.
    pub(crate) tail_bb: inkwell::basic_block::BasicBlock<'ctx>,
    /// Whether this frame describes a block or a loop, i.e. whether a
    /// `Br` should go to `tail_bb` or to `header_bb`.
    pub(crate) kind: LabelKind,
}

impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {
    /// Builds an emitter for `func` with pristine per-function state.
    ///
    /// The nine arguments are stored verbatim. Every other field starts
    /// out empty: the operand stack is pre-sized for 8 entries, all maps
    /// and vectors are empty, counters and bases are `0`, every optional
    /// handle is `None`, and both import slices are empty.
    /// `world_mode` starts as [`WorldMode::OpenWorld`] and `target` as
    /// `CodegenTarget::Native`.
    ///
    /// NOTE(review): fields such as `fast_path`, `helper_table`,
    /// `closure_fn_table`, `imports`, or `captures_ptr_param` are
    /// presumably filled in by the caller after construction — confirm
    /// against the call sites.
    // Nine flat constructor inputs exceed clippy's argument-count
    // threshold, hence the lint allowance.
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        ctx: &'ctx Context,
        builder: &'b Builder<'ctx>,
        module: &'b LlvmModule<'ctx>,
        func: FunctionValue<'ctx>,
        shape: EntryShape,
        arena_base_ptr: Option<PointerValue<'ctx>>,
        state_ptr: Option<PointerValue<'ctx>>,
        buffer_return_size: u32,
        const_pool: &'cp ConstPool,
    ) -> Self {
        Self {
            ctx,
            builder,
            func,
            module,
            shape,
            arena_base_ptr,
            state_ptr,
            stack: Vec::with_capacity(8),
            let_slots: std::collections::HashMap::new(),
            let_floor: 0,
            param_base: 0,
            label_stack: Vec::new(),
            name_seq: 0,
            buffer_return_size,
            fast_path: None,
            helper_table: None,
            helper_ret_ty: None,
            llvm_trap_fn: None,
            const_pool,
            inline_frames: Vec::new(),
            needs_tail_cursor: false,
            inplace_return_root: None,
            last_const_string: None,
            closure_fn_table: Vec::new(),
            record_locals: std::collections::HashMap::new(),
            self_capture_offsets: Vec::new(),
            self_capture_let_slots: std::collections::HashMap::new(),
            captures_ptr_param: None,
            fast_path_closure_let_slots: std::collections::HashMap::new(),
            const_string_let_slots: std::collections::HashMap::new(),
            known_closure_let_slots: std::collections::HashMap::new(),
            known_capture_offsets: Vec::new(),
            imports: &[],
            call_native_fn: None,
            world_mode: WorldMode::OpenWorld,
            target: crate::CodegenTarget::Native,
            effectful_imports: &[],
        }
    }

    /// Produces a fresh SSA value name of the form `"{hint}_{n}"`.
    ///
    /// The emitter-wide sequence counter is bumped first, so the first
    /// name handed out ends in `_1` and no two calls share a suffix.
    pub(crate) fn next_name(&mut self, hint: &str) -> String {
        self.name_seq += 1;
        let seq = self.name_seq;
        format!("{hint}_{seq}")
    }

    // -- stack helpers --------------------------------------------------

    /// Pushes `v` onto the operand stack with IR type `ty` and no
    /// provenance information ([`Provenance::None`]).
    pub(crate) fn push(&mut self, v: IntValue<'ctx>, ty: IrType) {
        self.push_with_prov(v, ty, Provenance::None);
    }

    /// Pushes `v` onto the operand stack with IR type `ty`, tagged with
    /// the caller-supplied [`Provenance`].
    ///
    /// Used wherever the emitter knows something about a value's
    /// origin — for example `LocalGet(0)` inside a lambda, or a
    /// `LetGet` that re-stamps a tag remembered by the matching
    /// `LetSet` — so later ops can pick a cheaper lowering.
    pub(crate) fn push_with_prov(&mut self, v: IntValue<'ctx>, ty: IrType, prov: Provenance) {
        let tagged = TypedValue { val: v, ty, prov };
        self.stack.push(tagged);
    }

    /// Phase F.W7 / W18: decide which provenance a load from
    /// `captures_ptr + offset` should carry, without touching the
    /// operand stack.
    ///
    /// The stack top is inspected (not popped). Unless it is tagged
    /// [`Provenance::OwnCapturesPtr`] — or if the stack is empty — the
    /// answer is `None`. Otherwise `offset` is looked up, in order, in:
    ///
    /// 1. `self_capture_offsets` → [`Provenance::OwnCaptureHandle`]
    ///    (self-recursive capture; checked first because reusing the
    ///    current `captures_ptr` is strictly cheaper than re-loading
    ///    the handle's captures_ptr field);
    /// 2. `known_capture_offsets` → [`Provenance::KnownClosure`]
    ///    (capture of a known, non-self closure, so `CallClosure` can
    ///    emit a direct call while still loading the capture's own
    ///    captures_ptr).
    ///
    /// A `None` result means the loaded value keeps
    /// [`Provenance::None`] and the closure-call emitter falls back to
    /// the slow-path switch dispatch.
    ///
    /// Must be called while the base pointer is still on top of the
    /// stack, i.e. before the load itself pops it.
    pub(crate) fn peek_self_capture_provenance(&self, offset: u32) -> Option<Provenance> {
        match self.stack.last() {
            Some(top) if top.prov == Provenance::OwnCapturesPtr => {}
            _ => return None,
        }

        let self_recursive = self
            .self_capture_offsets
            .iter()
            .find(|entry| entry.0 == offset)
            .map(|entry| Provenance::OwnCaptureHandle {
                offset,
                self_fn_table_idx: entry.1,
            });

        self_recursive.or_else(|| {
            self.known_capture_offsets
                .iter()
                .find(|entry| entry.0 == offset)
                .map(|entry| Provenance::KnownClosure {
                    fn_table_idx: entry.1,
                })
        })
    }

    /// Removes and returns the top operand-stack entry (value, IR type
    /// and provenance).
    ///
    /// # Errors
    /// Returns [`LlvmError::Codegen`] mentioning `ip_hint` when the
    /// stack is empty.
    pub(crate) fn pop(&mut self, ip_hint: &str) -> Result<TypedValue<'ctx>, LlvmError> {
        match self.stack.pop() {
            Some(entry) => Ok(entry),
            None => Err(LlvmError::Codegen(format!(
                "operand stack underflow at {ip_hint}: producer emitted an Op with no matching push"
            ))),
        }
    }

    /// Like [`Self::pop`], but discards the IR type and provenance and
    /// returns only the underlying LLVM integer value.
    ///
    /// # Errors
    /// Propagates the stack-underflow error from [`Self::pop`].
    pub(crate) fn pop_int(&mut self, ip_hint: &str) -> Result<IntValue<'ctx>, LlvmError> {
        let entry = self.pop(ip_hint)?;
        Ok(entry.val)
    }

    // -- locals / lets --------------------------------------------------

    /// Resolves IR parameter `idx` to the LLVM parameter at position
    /// `param_base + idx` of the function being emitted.
    ///
    /// # Errors
    /// Returns [`LlvmError::Codegen`] when `param_base + idx` overflows
    /// `u32`, when the function has no parameter at that position, or
    /// when the parameter is not an integer value.
    pub(crate) fn lookup_param(&self, idx: u32) -> Result<IntValue<'ctx>, LlvmError> {
        let llvm_idx = match self.param_base.checked_add(idx) {
            Some(sum) => sum,
            None => {
                return Err(LlvmError::Codegen(format!(
                    "LocalGet({idx}): param idx overflow"
                )));
            }
        };

        let param = match self.func.get_nth_param(llvm_idx) {
            Some(found) => found,
            None => {
                return Err(LlvmError::Codegen(format!(
                    "LocalGet({idx}) -> llvm param #{llvm_idx} out of range; function has {} param(s)",
                    self.func.count_params()
                )));
            }
        };

        match param {
            BasicValueEnum::IntValue(int_param) => Ok(int_param),
            other => Err(LlvmError::Codegen(format!(
                "LocalGet({idx}) llvm param #{llvm_idx} is {other:?}, expected IntValue"
            ))),
        }
    }

    /// Returns the stack slot (an `alloca`) backing let-binding `idx`,
    /// creating and registering it on first use.
    ///
    /// A slot that already exists is returned as-is, provided it was
    /// created with the same `ty`. A new slot is allocated at the top of
    /// the function's entry block and named `let_{idx}`; the builder's
    /// insertion point is put back to where it was before this call,
    /// on success and on allocation failure alike.
    ///
    /// # Errors
    /// Returns [`LlvmError::Codegen`] when
    /// * slot `idx` already exists with a different type,
    /// * the function has no entry block yet, or
    /// * LLVM fails to build the `alloca`.
    pub(crate) fn ensure_let_slot(
        &mut self,
        idx: u32,
        ty: IrType,
    ) -> Result<PointerValue<'ctx>, LlvmError> {
        if let Some((ptr, existing_ty)) = self.let_slots.get(&idx) {
            if *existing_ty != ty {
                return Err(LlvmError::Codegen(format!(
                    "let-slot {idx} aliased: previous type {existing_ty:?}, new type {ty:?}"
                )));
            }
            return Ok(*ptr);
        }
        // Allocate in the function's entry block so the alloca is
        // hoisted out of any loop body. inkwell's `build_alloca`
        // emits at the current position, so we temporarily reposition.
        let entry_bb = self.func.get_first_basic_block().ok_or_else(|| {
            LlvmError::Codegen("ensure_let_slot: function has no entry block".into())
        })?;
        let cur = self.builder.get_insert_block();
        // Position at the start of the entry block so allocas group
        // at the top — LLVM mem2reg requires this canonical layout
        // to promote slots into SSA.
        if let Some(first_instr) = entry_bb.get_first_instruction() {
            self.builder.position_before(&first_instr);
        } else {
            self.builder.position_at_end(entry_bb);
        }
        let llvm_ty: inkwell::types::BasicTypeEnum<'ctx> = match ty {
            // AOT-1: F64 rides as i64 bits on the virtual stack, so its
            // let-slot is the same 64-bit-wide integer alloca as I64.
            // Slots are keyed by `idx` alone; the type check at the top
            // of this function rejects reusing an index under a
            // different type, so an I64 bit pattern never gets
            // reinterpreted as F64 (or vice versa) through a slot.
            IrType::I64 | IrType::F64 => self.ctx.i64_type().into(),
            // Phase E.1: String / List* arena offsets ride on an i32
            // slot — matches the cranelift backend's pointer-as-i32
            // wire representation.
            //
            // Phase F.W7: `Closure` joins the i32-wide variants
            // (closure handle is an arena-relative i32 pointer at
            // the IR / cranelift / LLVM boundary alike).
            IrType::I32
            | IrType::Bool
            | IrType::Unit
            | IrType::String
            | IrType::ListInt
            | IrType::ListFloat
            | IrType::ListBool
            | IrType::ListString
            | IrType::ListSchema
            | IrType::ListList
            | IrType::Closure
            | IrType::Dict => self.ctx.i32_type().into(),
        };
        let name = format!("let_{idx}");
        let alloca = self.builder.build_alloca(llvm_ty, &name);
        // Put the builder back *before* looking at the result: bailing
        // out with `?` first would leave the insertion point stranded
        // inside the entry block on the error path.
        if let Some(bb) = cur {
            self.builder.position_at_end(bb);
        }
        let ptr =
            alloca.map_err(|e| LlvmError::Codegen(format!("let-slot {idx} alloca: {e}")))?;
        self.let_slots.insert(idx, (ptr, ty));
        Ok(ptr)
    }

    // -- entry point ----------------------------------------------------

    /// Lowers every op of `body` in order, then makes sure the block
    /// the builder ends up in is terminated.
    ///
    /// Each op is handed to [`Self::lower_op`] together with its index
    /// in `body`; the first failure aborts the walk.
    ///
    /// # Errors
    /// Propagates any error from [`Self::lower_op`], and returns
    /// [`LlvmError::Codegen`] if the trailing `unreachable` cannot be
    /// built.
    pub(crate) fn lower_body(&mut self, body: &[TaggedOp]) -> Result<(), LlvmError> {
        body.iter()
            .enumerate()
            .try_for_each(|(ip, tagged)| self.lower_op(ip, tagged))?;

        // `Op::Return` (and likewise `Br`) leaves the builder in a fresh
        // continuation block that is dead and has no terminator. The
        // LLVM verifier rejects unterminated blocks, so cap it with
        // `unreachable`.
        let dangling = match self.builder.get_insert_block() {
            Some(bb) => bb.get_terminator().is_none(),
            None => false,
        };
        if dangling {
            self.builder
                .build_unreachable()
                .map_err(|e| LlvmError::Codegen(format!("trailing unreachable: {e}")))?;
        }
        Ok(())
    }

    // -- per-op lowering ------------------------------------------------

    pub(crate) fn lower_op(&mut self, ip: usize, tagged: &TaggedOp) -> Result<(), LlvmError> {
        let ip_hint = format!("ip={ip} op={:?}", tagged.op);
        // Phase H const-needle fast path: capture (and clear) the
        // `Op::ConstString` peek-state at the very start of every
        // `lower_op` dispatch. The `Op::Call` arm consults `prev_const_string`
        // to decide between the inline byte-scan and the extern shim.
        // Every other arm leaves `self.last_const_string` at `None` —
        // the only re-populator is the `Op::ConstString` arm at its
        // tail. Result: `prev_const_string.is_some()` iff the prior
        // emitted op was `Op::ConstString` and its value is still the
        // top-of-stack (no intervening op consumed it).
        let prev_const_string = self.last_const_string.take();
        match &tagged.op {
            // ---- literals ----
            Op::ConstI64(v) => {
                let c = self.ctx.i64_type().const_int(*v as u64, true);
                self.push(c, IrType::I64);
            }
            Op::ConstI32(v) => {
                let c = self.ctx.i32_type().const_int(*v as u32 as u64, false);
                self.push(c, IrType::I32);
            }
            Op::ConstBool(b) => {
                // Bool occupies an i32 slot on the IR's virtual stack.
                let c = self.ctx.i32_type().const_int(u64::from(*b), false);
                self.push(c, IrType::Bool);
            }
            Op::ConstF64(v) => {
                // AOT-1: materialise the `double` literal then bit-cast
                // to i64 so the operand stack stays integer-typed
                // (Option B). `v` is an `OrderedFloat<f64>`.
                let f = self.ctx.f64_type().const_float(v.into_inner());
                let bits = self
                    .builder
                    .build_bit_cast(f, self.ctx.i64_type(), &self.next_name("constf64_bits"))
                    .map_err(|e| LlvmError::Codegen(format!("ConstF64 bitcast: {e}")))?
                    .into_int_value();
                self.push(bits, IrType::F64);
            }

            // ---- locals / lets ----
            Op::LocalGet(idx) => {
                // Phase E.1: an active inline frame redirects
                // `LocalGet(i)` to the inlined call's `i`-th argument
                // instead of the entry-function's LLVM params.
                if let Some(frame) = self.inline_frames.last() {
                    let i = *idx as usize;
                    let tv = frame.params.get(i).ok_or_else(|| {
                        LlvmError::Codegen(format!(
                            "inline LocalGet({idx}) out of range — callee has {} params",
                            frame.params.len()
                        ))
                    })?;
                    // Preserve provenance across the inline-frame argument
                    // bind. The bundled `list_int_filter` body reads its
                    // closure parameter via `LocalGet(1)`; when the caller
                    // passed a literal `MakeClosure` (a `KnownClosure`
                    // handle), forwarding that provenance lets the body's
                    // per-element `CallClosure` devirtualise into a direct
                    // call. Only `KnownClosure` is propagated here — the
                    // self-recursion / fast-path-entry tags depend on the
                    // current function's `captures_ptr_param` / fast-path
                    // state, which a *callee* inline frame does not share,
                    // so forwarding those would be unsound.
                    let (val, prov) = (tv.val, tv.prov);
                    match prov {
                        Provenance::KnownClosure { .. } => {
                            self.push_with_prov(val, tv.ty, prov);
                        }
                        _ => self.push(val, tv.ty),
                    }
                } else {
                    let p = self.lookup_param(*idx)?;
                    // The legacy envelope walks all-i64; the buffer envelope
                    // walks (i32 ×4, i64). The IR has the right type on
                    // the param descriptor, but we don't carry it through
                    // LocalGet — re-derive from the LLVM param width.
                    let width = p.get_type().get_bit_width();
                    let ty = if width == 32 {
                        IrType::I32
                    } else {
                        IrType::I64
                    };
                    // Phase F.W7 self-recursion fast path: tag
                    // `LocalGet(0)` inside a lambda body with
                    // [`Provenance::OwnCapturesPtr`] so the prologue
                    // capture-load chain can stamp
                    // [`Provenance::OwnCaptureHandle`] on self-
                    // recursive handles. Only fires inside a lambda
                    // (param_base == 1 means the LLVM param 0 is
                    // `*state` and param 1 is the captures_ptr arg);
                    // the entry / helpers leave provenance at
                    // `None`.
                    if *idx == 0 && self.captures_ptr_param.is_some() {
                        self.push_with_prov(p, ty, Provenance::OwnCapturesPtr);
                    } else {
                        self.push(p, ty);
                    }
                }
            }
            Op::LetSet { idx, ty } => {
                let v = self.pop(&ip_hint)?;
                let mapped = self.remap_let_idx(*idx);
                let slot = self.ensure_let_slot(mapped, *ty)?;
                // Coerce on bool / null where the producer pushed an i32
                // slot but the let-slot was declared as the canonical
                // 32-bit width.
                let stored = self.coerce_to_let_ty(v, *ty)?;
                self.builder
                    .build_store(slot, stored)
                    .map_err(|e| LlvmError::Codegen(format!("LetSet store: {e}")))?;
                // Phase F.W7 self-recursion fast path: when storing a
                // closure handle whose provenance points back at the
                // enclosing lambda, remember the let-slot so a later
                // `LetGet` resurrects the same provenance. This is
                // what bridges the prologue's capture-load chain
                // (`LocalGet(0); LoadI32AtAbsolute { offset }; LetSet
                // { idx, Closure }`) and the recursive call site
                // (`LetGet { idx, Closure }; ...; CallClosure`).
                if let Provenance::OwnCaptureHandle {
                    offset,
                    self_fn_table_idx,
                } = v.prov
                {
                    if matches!(*ty, IrType::Closure) {
                        self.self_capture_let_slots
                            .insert(mapped, (offset, self_fn_table_idx));
                    }
                }
                // Phase D.2 fast-path entry: when storing a virtualised
                // closure produced by an in-body `MakeClosure` (no
                // arena/state available), remember the `fn_table_idx`
                // so the matching `LetGet` re-emits the provenance and
                // the downstream `CallClosure` can rewrite into a
                // direct call.
                if let Provenance::FastPathClosure { fn_table_idx } = v.prov {
                    if matches!(*ty, IrType::Closure) {
                        self.fast_path_closure_let_slots
                            .insert(mapped, fn_table_idx);
                    }
                }
                // Devirtualisation (W18): propagate `KnownClosure`
                // across the `LetSet` → `LetGet` chain so a closure
                // handle stored into a let then read back at a
                // `CallClosure` site keeps its compile-time
                // `fn_table_idx`. A `LetSet { Closure }` of any *other*
                // provenance overwrites the slot with a value we cannot
                // prove is the same single closure, so drop the entry —
                // a later `LetGet` then falls back to the runtime
                // switch. This invalidation is what keeps a slot that is
                // reassigned to a dynamically-chosen closure correct.
                match (v.prov, *ty) {
                    (Provenance::KnownClosure { fn_table_idx }, IrType::Closure) => {
                        self.known_closure_let_slots.insert(mapped, fn_table_idx);
                    }
                    (_, IrType::Closure) => {
                        self.known_closure_let_slots.remove(&mapped);
                    }
                    _ => {}
                }
                // Phase L W3: propagate `Provenance::ConstString`
                // across the `LetSet` → `LetGet` chain so the reduce
                // closure's `s` (set every iteration from the same
                // const literal "a" in the W3 source) can be picked
                // up by `Op::Add(String)` as a const-len operand.
                // Any non-const-string `LetSet` against the same idx
                // wipes the entry below.
                match (v.prov, *ty) {
                    (Provenance::ConstString { len, first_byte }, IrType::String) => {
                        self.const_string_let_slots
                            .insert(mapped, (len, first_byte));
                    }
                    (_, IrType::String) => {
                        // A non-const value just overwrote the slot —
                        // drop any stale const-string record so a
                        // later `LetGet` cannot fraudulently claim
                        // const-len status.
                        self.const_string_let_slots.remove(&mapped);
                    }
                    _ => {}
                }
            }
            Op::LetGet { idx, ty } => {
                // Phase E.1: remap the callee's let-idx against the
                // active inline frame so concurrent stdlib inlines
                // don't clash on slot numbers.
                let mapped = self.remap_let_idx(*idx);
                let slot = self.ensure_let_slot(mapped, *ty)?;
                let llvm_ty: inkwell::types::BasicTypeEnum<'ctx> = match *ty {
                    // AOT-1: F64 rides as i64 bits, so its let-slot loads
                    // back as an i64 (the raw bit pattern, reinterpreted
                    // as `double` only at the arithmetic / store site).
                    IrType::I64 | IrType::F64 => self.ctx.i64_type().into(),
                    IrType::I32
                    | IrType::Bool
                    | IrType::Unit
                    | IrType::String
                    | IrType::ListInt
                    | IrType::ListFloat
                    | IrType::ListBool
                    | IrType::ListString
                    | IrType::ListSchema
                    | IrType::ListList
                    | IrType::Closure
                    | IrType::Dict => self.ctx.i32_type().into(),
                };
                let name = self.next_name("letget");
                let v = self
                    .builder
                    .build_load(llvm_ty, slot, &name)
                    .map_err(|e| LlvmError::Codegen(format!("LetGet load: {e}")))?
                    .into_int_value();
                // Phase F.W7 self-recursion fast path: when the let-slot
                // was populated by the lambda prologue's self-capture
                // load chain, re-stamp the matching
                // [`Provenance::OwnCaptureHandle`] so the recursive
                // call site (which reads the closure handle via
                // `LetGet`) keeps the fast-path tag alive.
                if matches!(*ty, IrType::Closure) {
                    if let Some(&(offset, self_fn_table_idx)) =
                        self.self_capture_let_slots.get(&mapped)
                    {
                        self.push_with_prov(
                            v,
                            *ty,
                            Provenance::OwnCaptureHandle {
                                offset,
                                self_fn_table_idx,
                            },
                        );
                    } else if let Some(&fn_table_idx) =
                        self.fast_path_closure_let_slots.get(&mapped)
                    {
                        // Phase D.2 fast-path entry: re-stamp the
                        // virtualised-closure tag so the matching
                        // `CallClosure` keeps the direct-call rewrite
                        // available.
                        self.push_with_prov(v, *ty, Provenance::FastPathClosure { fn_table_idx });
                    } else if let Some(&fn_table_idx) = self.known_closure_let_slots.get(&mapped) {
                        // Devirtualisation (W18): re-stamp `KnownClosure`
                        // so a `CallClosure` reading this handle through
                        // the let chain emits a direct call (still
                        // loading the real captures_ptr) instead of the
                        // runtime switch.
                        self.push_with_prov(v, *ty, Provenance::KnownClosure { fn_table_idx });
                    } else {
                        self.push(v, *ty);
                    }
                } else if matches!(*ty, IrType::String) {
                    // Phase L W3: re-stamp `Provenance::ConstString`
                    // when the let-slot is known to hold a value
                    // sourced from `Op::ConstString`. Crucial for the
                    // reduce closure's `s` operand — the iter-body
                    // sets `s` from a const literal then `LetGet`s it
                    // into the `Op::Add(String)` rhs, so without
                    // propagation the const-len fast path can never
                    // fire across the let chain.
                    if let Some(&(len, first_byte)) = self.const_string_let_slots.get(&mapped) {
                        self.push_with_prov(v, *ty, Provenance::ConstString { len, first_byte });
                    } else {
                        self.push(v, *ty);
                    }
                } else {
                    self.push(v, *ty);
                }
            }

            // ---- arithmetic ----
            Op::Add(ty) => match ty {
                // Phase E.1: `Op::Add(IrType::String)` is the
                // pair-wise String + String form (the StrConcatN
                // fold only fires for compile-time-known chains —
                // `reduce("", (acc, s) => acc + s)` lowers to a
                // per-iter `Add(String)`).
                //
                // Phase I (W3 string-concat gap close): emit the
                // in-place-append fast path. The W3 reduce hot loop
                // walks `acc = acc + "a"` for N iters; under the
                // historical inlined-`concat` body that turned into
                // an O(N²) byte-copy storm because every iter
                // reallocated a fresh scratch record. The new
                // helper recognises the "lhs is the most recent
                // scratch alloc" case at runtime and extends the
                // record in place — total work drops to O(N) bytes,
                // matching `String::push_str`. The slow path stays
                // bit-identical with the historical lowering so
                // mixed-source string adds (const-pool literals,
                // out-of-order scratch records) still produce a
                // fresh record.
                IrType::String => self.emit_str_add_inplace_or_concat(&ip_hint)?,
                _ => self.emit_binop(&ip_hint, *ty, BinOp::Add)?,
            },
            Op::Sub(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Sub)?,
            Op::Mul(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Mul)?,
            Op::Div(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Div)?,
            Op::Mod(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Mod)?,
            Op::BitAnd(ty) => self.emit_binop(&ip_hint, *ty, BinOp::BitAnd)?,
            Op::ConvertI64ToF64 => self.emit_convert_i64_to_f64(&ip_hint)?,
            Op::F64ToI64Sat => self.emit_f64_to_i64_sat(&ip_hint)?,
            Op::F64Unary(op) => self.emit_f64_unary(&ip_hint, *op)?,
            Op::F64Pow => self.emit_f64_pow(&ip_hint)?,

            // ---- comparisons ----
            Op::Eq(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::EQ)?,
            Op::Ne(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::NE)?,
            Op::Lt(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SLT)?,
            Op::Le(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SLE)?,
            Op::Gt(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SGT)?,
            Op::Ge(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SGE)?,

            // ---- buffer-protocol I/O ----
            Op::LoadField { offset, ty } => self.emit_load_field(*offset, *ty)?,
            Op::StoreField {
                offset,
                ty,
                inplace,
            } => self.emit_store_field(&ip_hint, *offset, *ty, *inplace)?,

            // ---- pointer-indirect param loads (Phase 2 relon-rs surface) ----
            // String / List* `#main` parameters arrive in the input
            // buffer as a 4-byte buffer-relative offset to a tail
            // record. The IR's lowering pass emits `Op::LoadStringPtr`
            // (and its List* siblings) instead of `Op::LoadField {
            // ty: String }` so the dispatch stays unambiguous; we
            // share the same `emit_load_pointer_indirect_param` impl
            // for all variants.
            Op::LoadStringPtr { offset } => {
                self.emit_load_pointer_indirect_param(*offset, IrType::String)?
            }
            Op::LoadListIntPtr { offset } => {
                self.emit_load_pointer_indirect_param(*offset, IrType::ListInt)?
            }
            Op::LoadListFloatPtr { offset } => {
                self.emit_load_pointer_indirect_param(*offset, IrType::ListFloat)?
            }
            Op::LoadListBoolPtr { offset } => {
                self.emit_load_pointer_indirect_param(*offset, IrType::ListBool)?
            }
            Op::LoadListStringPtr { offset } => {
                self.emit_load_pointer_indirect_param(*offset, IrType::ListString)?
            }
            Op::LoadListSchemaPtr { offset } => {
                self.emit_load_pointer_indirect_param(*offset, IrType::ListSchema)?
            }
            Op::LoadListListPtr { offset } => {
                self.emit_load_pointer_indirect_param(*offset, IrType::ListList)?
            }

            // ---- ReadStringLen (Phase 2 — backs `length(s)` / `len(xs)`) ----
            // Pop arena-relative i32 record pointer, load the leading
            // 4-byte length prefix, zext to i64 and push. Used by the
            // bundled stdlib `length` (String) / `list_*_length` bodies
            // — every list record shares the `[len: u32 LE]` prefix
            // with String, so a single lowering covers both.
            Op::ReadStringLen => self.emit_read_string_len(&ip_hint)?,

            // ---- control flow ----
            Op::Block { result_ty, body } => self.emit_block(*result_ty, body)?,
            Op::Loop { result_ty, body } => self.emit_loop(*result_ty, body)?,
            Op::Br { label_depth } => self.emit_br(*label_depth)?,
            Op::BrIf { label_depth } => self.emit_br_if(&ip_hint, *label_depth)?,
            Op::If {
                result_ty,
                then_body,
                else_body,
            } => self.emit_if(&ip_hint, *result_ty, then_body, else_body)?,

            // ---- return ----
            Op::Return => self.emit_return(&ip_hint)?,

            // ---- Phase E.1: const-data pool ----
            Op::ConstString { idx, value } => {
                let off = self
                    .const_pool
                    .string_offsets
                    .get(idx)
                    .copied()
                    .ok_or_else(|| {
                        LlvmError::Codegen(format!(
                            "Op::ConstString {{ idx: {idx} }}: missing const-pool entry — \
                         did the host forget to lay out the pool blob before dispatch?"
                        ))
                    })?;
                let c = self.ctx.i32_type().const_int(u64::from(off), false);
                // Phase L W3: stamp const-len provenance so the
                // downstream `Op::Add(String)` lowering (via
                // `emit_str_add_inplace_or_concat`) can use the
                // compile-time-known length to elide the per-iter
                // `[len]` header reload and replace the rhs memcpy
                // with a single byte store when the literal is one
                // byte (the dominant cmp_lua W3 reduce shape). The
                // provenance only survives across `LetSet`/`LetGet`
                // for `IrType::String` (tracked in
                // `const_string_let_slots`) so non-String consumers
                // never observe it.
                let bytes = value.as_bytes();
                let len_u32 = u32::try_from(bytes.len()).map_err(|_| {
                    LlvmError::Codegen("ConstString length exceeds u32 range".into())
                })?;
                let first_byte = if bytes.len() == 1 {
                    Some(bytes[0])
                } else {
                    None
                };
                self.push_with_prov(
                    c,
                    IrType::String,
                    Provenance::ConstString {
                        len: len_u32,
                        first_byte,
                    },
                );
                // Phase H peek-state: record the literal bytes so the
                // next `lower_op` call can detect `Op::Call(contains)`
                // with this string still at top-of-stack and switch
                // to the inline byte-scan instead of the extern shim.
                // Cleared at the start of every `lower_op` — see the
                // `prev_const_string.take()` line at the dispatch
                // head — so a single intervening op (Push / Pop /
                // Add / ...) drops the optimisation cleanly.
                self.last_const_string = Some(bytes.to_vec());
            }

            // ---- Phase E.1: raw-memory primitives ----
            Op::LoadI32AtAbsolute { offset } => {
                // Phase F.W7 self-recursion fast path: when the base
                // (top-of-stack at this point) is the lambda's own
                // captures_ptr arg and the offset matches a recorded
                // self-recursive capture slot, the result is a
                // closure handle whose backing struct points back at
                // the enclosing lambda. Stash the provenance hint
                // so the downstream `LetSet/LetGet/CallClosure` chain
                // can short-circuit the indirect dispatch. The
                // sniff peeks at the stack-top without mutating it;
                // the actual load still flows through
                // `emit_load_at_absolute` so we don't fork the
                // raw-memory primitive's lowering.
                let prov_hint = self.peek_self_capture_provenance(*offset);
                self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I32)?;
                if let Some(prov) = prov_hint {
                    if let Some(top) = self.stack.last_mut() {
                        top.prov = prov;
                    }
                }
            }
            Op::LoadI64AtAbsolute { offset } => {
                self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I64)?
            }
            Op::LoadI8UAtAbsolute { offset } => {
                self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I8U)?
            }
            Op::LoadF64AtAbsolute { offset } => {
                self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::F64)?
            }
            Op::StoreI32AtAbsolute { offset } => {
                self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I32)?
            }
            Op::StoreI64AtAbsolute { offset } => {
                self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I64)?
            }
            Op::StoreI8AtAbsolute { offset } => {
                self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I8)?
            }
            Op::StoreF64AtAbsolute { offset } => {
                self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::F64)?
            }
            Op::MemcpyAtAbsolute => self.emit_memcpy_at_absolute(&ip_hint)?,
            Op::AllocScratch { size_bytes } => self.emit_alloc_scratch_static(*size_bytes)?,
            Op::AllocScratchDyn => self.emit_alloc_scratch_dyn(&ip_hint)?,
            Op::StrConcatN { operand_count } => self.emit_str_concat_n(&ip_hint, *operand_count)?,
            Op::IntToStr => self.emit_int_to_str(&ip_hint)?,
            Op::FloatToStr => self.emit_float_to_str(&ip_hint)?,

            // ---- Phase E.1 + E.2 call dispatch ----
            // stdlib indices (#278) route through the bundled-body
            // inline path (`emit_call_stdlib`); user-defined indices
            // (#279) resolve through the helper table populated by
            // `emit_module_funcs`.
            Op::Call {
                fn_index,
                arg_count,
                param_tys,
                ret_ty,
            } => {
                let stdlib_count = relon_ir::stdlib::stdlib_function_count();
                // Phase F.1: `contains(haystack, needle) -> Bool` short-
                // circuit. The bundled stdlib body is a hand-transcribed
                // O(s_len * p_len) byte scan that defeats LLVM's auto-
                // vectoriser on the inner compare loop (every iter
                // reloads the needle bytes through a let-slot). On the
                // W4 / W4_long cmp_lua rows that turns into a 3.4× /
                // 256× gap vs LuaJIT (which uses SIMD-accelerated
                // `string.find`). Route the call through the host shim
                // `relon_llvm_str_contains_arena` which defers to
                // `core::str::contains` — std's substring search backs
                // single-byte needles with SIMD `memchr` and uses a
                // Two-Way matcher for longer needles, closing the gap
                // without inventing a Relon-specific SIMD path.
                if *fn_index < stdlib_count
                    && relon_ir::stdlib::stdlib_function_index("contains") == Some(*fn_index)
                    && *arg_count == 2
                    && param_tys == &[IrType::String, IrType::String]
                    && *ret_ty == IrType::Bool
                {
                    // Phase H: when the needle was pushed by the
                    // immediately-preceding `Op::ConstString` (peek
                    // state populated at `lower_op` head), inline a
                    // tight byte-scan against the literal bytes.
                    // Skips the `relon_llvm_str_contains_arena` FFI
                    // boundary entirely — ~10-15 cycles of prologue /
                    // epilogue / IC atomic loads per call. The W4 /
                    // W4_long hot loops always hit this path (needle
                    // = `"x"` literal); dynamic-needle callers (e.g.
                    // `filter((s) => s.contains(other))` where
                    // `other` flows in via an outer let-slot) fall
                    // through to the existing Phase G extern shim.
                    if let Some(needle_bytes) = prev_const_string.as_deref() {
                        self.emit_str_contains_const_needle(&ip_hint, needle_bytes)?;
                    } else {
                        self.emit_str_contains_extern(&ip_hint)?;
                    }
                } else if *fn_index < stdlib_count {
                    self.emit_call_stdlib(&ip_hint, *fn_index, *arg_count, param_tys, *ret_ty)?
                } else {
                    self.emit_call(&ip_hint, *fn_index, *arg_count, param_tys, *ret_ty)?
                }
            }

            // ---- Phase F.W7: anon-Dict-return record ops ----
            // The IR lowering pass uses `AllocRootRecord` to bind a
            // per-record-local i32 alloca to `0` (the root sits at
            // `out_ptr + 0`); subsequent `StoreFieldAtRecord` ops use
            // the alloca-resident offset to compute the destination
            // address in the output buffer's fixed area.
            Op::AllocRootRecord { record_local_idx } => {
                self.emit_alloc_root_record(*record_local_idx)?
            }
            Op::StoreFieldAtRecord {
                record_local_idx,
                offset,
                ty,
            } => self.emit_store_field_at_record(&ip_hint, *record_local_idx, *offset, *ty)?,

            // ---- Phase F.W7: closure-as-value primitives ----
            Op::MakeClosure {
                fn_table_idx,
                captures,
                captures_size,
            } => self.emit_make_closure(&ip_hint, *fn_table_idx, captures, *captures_size)?,
            Op::CallClosure { param_tys, ret_ty } => {
                self.emit_call_closure(&ip_hint, param_tys, *ret_ty)?
            }

            // ---- Phase 0b family seams ----
            // The ops below are not yet lowered by the LLVM AOT backend.
            // They are listed EXPLICITLY (no `_ =>` wildcard) so that
            // adding a new `Op` variant fails to compile here — forcing a
            // deliberate decision instead of a silent runtime codegen
            // error. Each group delegates to a thin per-family entry
            // point living in that family's `codegen/<family>.rs` file,
            // so Phase 0b agents fill one family file each WITHOUT
            // touching this shared dispatch (zero merge conflicts). The
            // stubs return the same `LlvmError::Codegen` the catch-all
            // used to, so today's fallback behaviour is unchanged.

            // collections.rs — list/dict/sub-record construction
            Op::ConstListInt { .. }
            | Op::ConstListFloat { .. }
            | Op::ConstListBool { .. }
            | Op::ConstListString { .. }
            | Op::ConstDict { .. }
            | Op::DictGetByStringKey { .. }
            | Op::ListGetByIntIdx { .. }
            | Op::AllocSubRecord { .. }
            | Op::AllocScratchRecord { .. }
            | Op::PushRecordBase { .. }
            | Op::PushRecordBaseAbsolute { .. }
            | Op::StoreFieldAtRecordAbsolute { .. }
            | Op::EmitTailRecordFromAbsoluteAddr { .. }
            | Op::BuildVariantRecord { .. }
            | Op::BuildVariantRecordScratch { .. }
            | Op::BuildPointerList { .. } => {
                self.lower_collections_rest(ip, &ip_hint, &tagged.op)?
            }

            // control.rs — multi-way / select control flow
            Op::Select { .. } | Op::BrTable { .. } => {
                self.lower_control_rest(ip, &ip_hint, &tagged.op)?
            }

            // mem.rs — absolute-addressed field load
            Op::LoadFieldAtAbsolute { .. } => self.lower_mem_rest(ip, &ip_hint, &tagged.op)?,

            // call.rs — native dispatch + capability gate + trap
            Op::CallNative { .. } | Op::CheckCap { .. } | Op::Trap { .. } => {
                self.lower_call_rest(ip, &ip_hint, &tagged.op)?
            }

            // schema.rs — schema pointer / method dispatch
            Op::LoadSchemaPtr { .. } => self.lower_schema_rest(ip, &ip_hint, &tagged.op)?,

            // unicode.rs — *TableAddr long tail
            Op::CaseFoldTableAddr { .. }
            | Op::CombiningMarkRangesAddr
            | Op::WhitespaceRangesAddr
            | Op::DecompTableAddr { .. }
            | Op::CccTableAddr
            | Op::CompositionTableAddr
            | Op::FullCaseFoldTableAddr { .. }
            | Op::CasedRangesAddr
            | Op::CaseIgnorableRangesAddr
            | Op::TurkishCaseFoldTableAddr { .. } => {
                self.lower_unicode_rest(ip, &ip_hint, &tagged.op)?
            }
        }
        Ok(())
    }

    // -- Phase E.1: inline-call frame helpers --------------------------

    /// Rebase a callee-relative `LetGet/LetSet` slot index onto the
    /// innermost inline frame (cranelift's `remap_let_idx` counterpart).
    ///
    /// With at least one inline frame active, the result is that frame's
    /// `let_offset` plus `idx`, saturating at `u32::MAX`. With no frame
    /// active, `idx` is returned unchanged.
    pub(crate) fn remap_let_idx(&self, idx: u32) -> u32 {
        self.inline_frames
            .last()
            .map_or(idx, |top_frame| top_frame.let_offset.saturating_add(idx))
    }

    // -- helpers --------------------------------------------------------

    /// Resize an operand-stack value so its integer width matches the
    /// let-slot width of `target`.
    ///
    /// `I64` and `F64` slots are 64 bits wide; every other `IrType` uses
    /// a 32-bit slot. A value that already has the slot width is returned
    /// untouched, a narrower value is zero-extended, and a wider value is
    /// truncated.
    ///
    /// # Errors
    ///
    /// Returns `LlvmError::Codegen` when the builder fails to emit the
    /// zero-extend or truncate instruction.
    pub(crate) fn coerce_to_let_ty(
        &self,
        tv: TypedValue<'ctx>,
        target: IrType,
    ) -> Result<BasicValueEnum<'ctx>, LlvmError> {
        // AOT-1: F64 travels as its raw i64 bit pattern, so it shares the
        // 64-bit slot with I64. The coercion below only ever adjusts the
        // integer width — it is never an int<->float conversion.
        let slot_is_wide = match target {
            IrType::I64 | IrType::F64 => true,
            IrType::I32
            | IrType::Bool
            | IrType::Unit
            | IrType::String
            | IrType::ListInt
            | IrType::ListFloat
            | IrType::ListBool
            | IrType::ListString
            | IrType::ListSchema
            | IrType::ListList
            | IrType::Closure
            | IrType::Dict => false,
        };
        let slot_bits = if slot_is_wide { 64 } else { 32 };
        let value_bits = tv.val.get_type().get_bit_width();

        // Fast exit: nothing to emit when the widths already agree.
        if value_bits == slot_bits {
            return Ok(tv.val.into());
        }

        let slot_ty = if slot_is_wide {
            self.ctx.i64_type()
        } else {
            self.ctx.i32_type()
        };
        let resized = if value_bits < slot_bits {
            self.builder
                .build_int_z_extend(tv.val, slot_ty, "let_zext")
                .map_err(|e| LlvmError::Codegen(format!("let zext: {e}")))?
        } else {
            self.builder
                .build_int_truncate(tv.val, slot_ty, "let_trunc")
                .map_err(|e| LlvmError::Codegen(format!("let trunc: {e}")))?
        };
        Ok(resized.as_basic_value_enum())
    }

    // -- control flow ---------------------------------------------------
}

/// Former home of the inline lookup table used by `emit_load_field`
/// (the table that picks the LLVM integer type + the IR tag pushed back
/// onto the operand stack for a Phase-B-supported scalar field type).
///
/// NOTE(review): this `impl` block is empty — no such table is defined
/// here. Presumably the helper moved next to `emit_load_field`; confirm
/// and delete this block if so.
impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {}

// ---------------------------------------------------------------------------
// Phase E.1: raw-memory primitives, scratch allocator, StrConcatN.
// ---------------------------------------------------------------------------

impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {
    /// Pick the LLVM integer type that represents `ty` on the operand
    /// stack: `i64` for `I64` / `F64`, `i32` for every other `IrType`.
    ///
    /// Used by `Op::MakeClosure` capture reads and `Op::CallClosure`
    /// return loads. The match is deliberately exhaustive (no wildcard
    /// arm) so a newly added `IrType` variant fails to compile here.
    /// Every current variant maps to a type, so the result is always `Ok`.
    pub(crate) fn ir_ty_to_llvm_int(
        &self,
        ty: IrType,
    ) -> Result<inkwell::types::IntType<'ctx>, LlvmError> {
        let stack_int_ty = match ty {
            // 64-bit carriers.
            IrType::I64 | IrType::F64 => self.ctx.i64_type(),
            // Everything else rides as a 32-bit integer.
            IrType::I32
            | IrType::Bool
            | IrType::Unit
            | IrType::String
            | IrType::ListInt
            | IrType::ListFloat
            | IrType::ListBool
            | IrType::ListString
            | IrType::ListSchema
            | IrType::ListList
            | IrType::Closure
            | IrType::Dict => self.ctx.i32_type(),
        };
        Ok(stack_int_ty)
    }
}

#[cfg(test)]
mod const_pool_tests {
    //! Byte-level layout pins for the `ConstList*` const-pool records.
    //!
    //! The blob the LLVM `ConstPool` lays out for `ConstListInt` /
    //! `ConstListFloat` / `ConstListBool` is a cross-backend arena data
    //! contract: it must be byte-identical to what
    //! `relon_codegen_cranelift`'s `ConstPool::visit_const_list_*`
    //! produces (both backends copy the same blob into the arena
    //! prefix; a layout drift on one side silently corrupts the other's
    //! cached ET_REL). Both ConstPools are crate-private, so parity is
    //! pinned here against the documented wire layout the cranelift
    //! `visit_const_list_*` port was matched to:
    //!
    //! * int / float: align 8, `[len: u32 LE][pad: u32 zero][i64/f64 LE]`
    //! * bool:        align 4, `[len: u32 LE][u8 0/1 tightly packed]`
    use super::*;
    use relon_ir::ir::{Func, Op, TaggedOp};
    use relon_parser::TokenRange;

    /// Build a one-function module (`run_main`, no params, returns
    /// `I64`) whose body is `ops`, each tagged with a default source
    /// range.
    fn module_with_ops(ops: Vec<Op>) -> IrModule {
        let body: Vec<TaggedOp> = ops
            .into_iter()
            .map(|op| TaggedOp {
                op,
                range: TokenRange::default(),
            })
            .collect();
        let run_main = Func {
            name: "run_main".into(),
            params: vec![],
            ret: IrType::I64,
            body,
            range: TokenRange::default(),
        };
        IrModule {
            funcs: vec![run_main],
            entry_func_index: Some(0),
            imports: vec![],
            closure_table: vec![],
        }
    }

    #[test]
    fn const_list_int_byte_layout() {
        let module = module_with_ops(vec![Op::ConstListInt {
            idx: 0,
            elements: vec![10, 20, 30],
        }]);
        let pool = ConstPool::from_module(&module).unwrap();
        assert_eq!(pool.list_int_offsets.get(&0).copied(), Some(0));

        // Whole-blob pin: [len:u32=3][pad:4 zero][i64 x3 LE] = 32 bytes.
        let mut expected: Vec<u8> = Vec::new();
        expected.extend_from_slice(&3u32.to_le_bytes());
        expected.extend_from_slice(&[0u8; 4]);
        for element in [10i64, 20, 30] {
            expected.extend_from_slice(&element.to_le_bytes());
        }
        assert_eq!(&pool.bytes[..], &expected[..]);
    }

    #[test]
    fn const_list_float_byte_layout() {
        // The IR carries f64 elements as their u64 bit-pattern
        // (`ConstListFloat { elements: Vec<u64> }`), and the pool writes
        // that pattern out little-endian.
        let bits = [1.5f64.to_bits(), (-2.0f64).to_bits()];
        let module = module_with_ops(vec![Op::ConstListFloat {
            idx: 0,
            elements: bits.to_vec(),
        }]);
        let pool = ConstPool::from_module(&module).unwrap();
        assert_eq!(pool.list_float_offsets.get(&0).copied(), Some(0));

        // Whole-blob pin: [len:u32=2][pad:4 zero][f64 bits x2 LE] = 24 bytes.
        let mut expected: Vec<u8> = Vec::new();
        expected.extend_from_slice(&2u32.to_le_bytes());
        expected.extend_from_slice(&[0u8; 4]);
        for pattern in bits {
            expected.extend_from_slice(&pattern.to_le_bytes());
        }
        assert_eq!(&pool.bytes[..], &expected[..]);
    }

    #[test]
    fn const_list_bool_byte_layout() {
        let module = module_with_ops(vec![Op::ConstListBool {
            idx: 0,
            elements: vec![true, false, true],
        }]);
        let pool = ConstPool::from_module(&module).unwrap();
        assert_eq!(pool.list_bool_offsets.get(&0).copied(), Some(0));

        // Whole-blob pin: [len:u32=3][1,0,1], one byte per element with
        // no padding between them = 7 bytes.
        let mut expected: Vec<u8> = Vec::new();
        expected.extend_from_slice(&3u32.to_le_bytes());
        expected.extend_from_slice(&[1u8, 0, 1]);
        assert_eq!(&pool.bytes[..], &expected[..]);
    }

    #[test]
    fn const_list_alignment_across_records() {
        // The bool record occupies 4 (len) + 3 (payload) = 7 bytes. The
        // int record that follows must start on an 8-byte boundary so
        // that its i64 payload is 8-aligned.
        let module = module_with_ops(vec![
            Op::ConstListBool {
                idx: 0,
                elements: vec![true, false, true],
            },
            Op::ConstListInt {
                idx: 1,
                elements: vec![42],
            },
        ]);
        let pool = ConstPool::from_module(&module).unwrap();
        assert_eq!(pool.list_bool_offsets.get(&0).copied(), Some(0));
        // 7 bytes used → the int record header lands at offset 8.
        assert_eq!(pool.list_int_offsets.get(&1).copied(), Some(8));
        let int_len_prefix = &pool.bytes[8..12];
        let int_payload = &pool.bytes[16..24];
        assert_eq!(int_len_prefix, &1u32.to_le_bytes());
        assert_eq!(int_payload, &42i64.to_le_bytes());
    }

    #[test]
    fn const_list_string_byte_layout() {
        // W5-P2 pointer-array layout. Elements "a","bb","ccc":
        //   String records come first (each 4-aligned):
        //     off 0:  [slen=1]["a"]            -> 5 bytes, pad to 8
        //     off 8:  [slen=2]["bb"]           -> 6 bytes, pad to 16
        //     off 16: [slen=3]["ccc"]          -> 7 bytes, pad to 24
        //   then the header at off 24:
        //     [len=3][off_0=0][off_1=8][off_2=16]
        let module = module_with_ops(vec![Op::ConstListString {
            idx: 0,
            elements: vec!["a".into(), "bb".into(), "ccc".into()],
        }]);
        let pool = ConstPool::from_module(&module).unwrap();

        // (record offset, length prefix, payload) for each String record.
        // Padding bytes between records are intentionally not asserted.
        let records: [(usize, u32, &str); 3] = [(0, 1, "a"), (8, 2, "bb"), (16, 3, "ccc")];
        for (offset, slen, text) in records {
            let payload_start = offset + 4;
            let payload_end = payload_start + text.len();
            assert_eq!(&pool.bytes[offset..payload_start], &slen.to_le_bytes());
            assert_eq!(&pool.bytes[payload_start..payload_end], text.as_bytes());
        }

        // Header at offset 24: element count followed by one u32 record
        // offset per element.
        let header_at = pool.list_string_offsets.get(&0).copied();
        assert_eq!(header_at, Some(24));
        for (slot, word) in [3u32, 0, 8, 16].iter().enumerate() {
            let at = 24 + slot * 4;
            assert_eq!(&pool.bytes[at..at + 4], &word.to_le_bytes());
        }
        assert_eq!(pool.bytes.len(), 40);
    }

    #[test]
    fn duplicate_const_list_idx_is_noop() {
        // The same `idx` emitted twice must lay out a single record.
        let same_list = || Op::ConstListInt {
            idx: 0,
            elements: vec![1, 2],
        };
        let module = module_with_ops(vec![same_list(), same_list()]);
        let pool = ConstPool::from_module(&module).unwrap();
        // One record only: 8 header + 2*8 payload = 24.
        assert_eq!(pool.bytes.len(), 24);
    }
}

#[cfg(test)]
mod devirt_tests {
    //! Soundness unit tests for the W18 closure-devirtualisation
    //! capture analysis. They drive the IR scan that decides which
    //! captures may be stamped `KnownClosure` (→ direct call) and which
    //! stay a genuinely-dynamic dispatch (→ runtime switch). A wrong
    //! answer here is a silent miscompile, so the analysis is pinned
    //! independently of any end-to-end source.
    use super::*;
    use relon_ir::ir::{ClosureCapture, Func, IrType, Op, TaggedOp};
    use relon_parser::TokenRange;

    /// `Op::MakeClosure` for `fn_table_idx`. `captures_size` is the
    /// largest `offset + 8` over `captures`, or 0 when there are none.
    fn closure_op(fn_table_idx: u32, captures: Vec<ClosureCapture>) -> Op {
        let captures_size = captures
            .iter()
            .fold(0u32, |widest, capture| widest.max(capture.offset + 8));
        Op::MakeClosure {
            fn_table_idx,
            captures,
            captures_size,
        }
    }

    /// A `Closure`-typed capture of let-slot `let_idx`, stored at
    /// `offset` in the captures struct.
    fn closure_cap(let_idx: u32, offset: u32) -> ClosureCapture {
        ClosureCapture {
            let_idx,
            ty: IrType::Closure,
            offset,
        }
    }

    /// `LetSet` of a `Closure`-typed value into slot `idx`.
    fn bind_closure(idx: u32) -> Op {
        Op::LetSet {
            idx,
            ty: IrType::Closure,
        }
    }

    /// Entry function `run_main(I32) -> I32` whose body is `ops`, each
    /// tagged with a default source range.
    fn entry_fn(ops: Vec<Op>) -> Func {
        let body: Vec<TaggedOp> = ops
            .into_iter()
            .map(|op| TaggedOp {
                op,
                range: TokenRange::default(),
            })
            .collect();
        Func {
            name: "run_main".into(),
            params: vec![IrType::I32],
            ret: IrType::I32,
            body,
            range: TokenRange::default(),
        }
    }

    /// A capture of a *known, non-self* closure is recorded so the
    /// capturing lambda's body can devirtualise the call against it.
    /// Mirrors the W18 predicate `(k) => is_prime(k, 2)` capturing the
    /// `is_prime` closure (`fn_table_idx=0`).
    #[test]
    fn records_known_non_self_capture() {
        // let0 := MakeClosure(K=0)  ; the `is_prime` binding
        // MakeClosure(L=1) capturing let0 at offset 0 ; the predicate
        let entry = entry_fn(vec![
            // is_prime, capturing itself
            closure_op(0, vec![closure_cap(0, 0)]),
            bind_closure(0),
            // the predicate, capturing is_prime
            closure_op(1, vec![closure_cap(0, 0)]),
            Op::Call {
                fn_index: 14,
                arg_count: 2,
                param_tys: vec![IrType::ListInt, IrType::Closure],
                ret_ty: IrType::ListInt,
            },
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);

        // The predicate (L=1) captures known closure K=0 at offset 0.
        let predicate_caps = table.get(&1).map(Vec::as_slice);
        assert_eq!(
            predicate_caps,
            Some(&[(0u32, 0u32)][..]),
            "predicate (L=1) must record its is_prime (K=0) capture as known"
        );
        // is_prime's own capture is a SELF capture (K==L==0): it belongs
        // to the self-capture table (whose captures_ptr-reuse direct
        // path is strictly better) and must NOT show up here.
        assert!(
            !table.contains_key(&0),
            "self-capture (K==L) must be excluded from the known-capture table"
        );
    }

    /// When a closure let-slot is reassigned to a value that is NOT a
    /// literal `MakeClosure` (a genuinely-dynamic closure), the capture
    /// must NOT be recorded — the body keeps the runtime switch. This is
    /// the correctness red line: devirtualise only a provably-unique
    /// callee.
    #[test]
    fn drops_reassigned_dynamic_closure_slot() {
        // let0 := MakeClosure(0)        ; known
        // let0 := <some other Closure>  ; reassigned, now dynamic
        // MakeClosure(2) capturing let0 ; must NOT be recorded
        let entry = entry_fn(vec![
            closure_op(0, vec![closure_cap(0, 0)]),
            bind_closure(0),
            // The next `LetSet { Closure }` is fed by a `LetGet`, not a
            // `MakeClosure` — it models a closure arriving from
            // somewhere unprovable (a param, a phi, another binding).
            Op::LetGet {
                idx: 5,
                ty: IrType::Closure,
            },
            bind_closure(0),
            closure_op(2, vec![closure_cap(0, 0)]),
            bind_closure(9),
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);
        assert!(
            !table.contains_key(&2),
            "a capture of a reassigned (dynamic) closure slot must NOT be \
             recorded — the call must keep the runtime switch"
        );
    }

    /// The binding `LetSet` that immediately follows a known
    /// `MakeClosure` must NOT clear the slot it just established (the
    /// ordering bug fixed during development). A later capture of that
    /// slot is still recorded.
    #[test]
    fn binding_letset_does_not_clear_its_own_slot() {
        let entry = entry_fn(vec![
            closure_op(3, vec![]),
            bind_closure(7),
            closure_op(4, vec![closure_cap(7, 0)]),
            bind_closure(8),
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);
        let recorded = table.get(&4).map(Vec::as_slice);
        assert_eq!(
            recorded,
            Some(&[(0u32, 3u32)][..]),
            "L=4 must record its capture of known closure K=3 at offset 0"
        );
    }
}