relon-codegen-llvm 0.1.0-rc2

LLVM-backed AOT evaluator for Relon (Phase A bootstrap)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
//! `Op`-family: string construction + search.
//!
//! StrConcatN, the `Add(String)` in-place-append / concat fast path, and
//! the `contains` const-needle / extern-shim lowerings (plus their
//! libc/host-shim declarations).

use inkwell::module::Linkage;
use inkwell::values::{BasicMetadataValueEnum, BasicValueEnum, FunctionValue, IntValue};
use inkwell::{AddressSpace, IntPredicate};

use relon_ir::ir::IrType;

use crate::error::LlvmError;
use crate::state::{ARENA_STATE_OFFSET_SCRATCH_BASE, ARENA_STATE_OFFSET_SCRATCH_CURSOR};

use super::*;

impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {
    /// Emit the lowering for `Op::StrConcatN { operand_count }`.
    ///
    /// The top `operand_count` stack entries are popped as i32 arena
    /// offsets, each addressing a `[len: u32][payload]` string record.
    /// The emitted code adds up the operand headers, requests a single
    /// scratch record of `total + 4` bytes, writes the combined length
    /// into its header and memcpys every payload behind it in source
    /// order. The offset of the new record is pushed back as
    /// [`IrType::String`]. Counterpart of cranelift's
    /// `emit_str_concat_n`.
    ///
    /// # Errors
    ///
    /// Returns [`LlvmError::Codegen`] when `operand_count < 2` or when an
    /// LLVM builder call fails; failures from the stack / arena helpers
    /// are propagated unchanged.
    pub(crate) fn emit_str_concat_n(
        &mut self,
        ip_hint: &str,
        operand_count: u32,
    ) -> Result<(), LlvmError> {
        if operand_count < 2 {
            return Err(LlvmError::Codegen(format!(
                "Op::StrConcatN with operand_count={operand_count} (expected >= 2)"
            )));
        }
        let count = operand_count as usize;
        let word_ty = self.ctx.i32_type();
        let wide_ty = self.ctx.i64_type();
        // Size of the `[len: u32]` header that prefixes every record.
        let header_bytes = word_ty.const_int(4, false);

        // The stack yields operands last-to-first; flip them afterwards so
        // index 0 is the leftmost (deepest) operand.
        let mut operand_offs: Vec<IntValue<'ctx>> = Vec::with_capacity(count);
        while operand_offs.len() < count {
            let popped = self.pop_int(ip_hint)?;
            operand_offs.push(popped);
        }
        operand_offs.reverse();

        // Read every operand's length header exactly once.
        let mut payload_lens: Vec<IntValue<'ctx>> = Vec::with_capacity(count);
        for &operand in &operand_offs {
            let header_ptr =
                self.arena_addr_i32_checked_const(operand, 4, "StrConcatN operand len")?;
            let load_name = self.next_name("strconcat_len");
            let loaded = self
                .builder
                .build_load(word_ty, header_ptr, &load_name)
                .map_err(|e| LlvmError::Codegen(format!("StrConcatN len load: {e}")))?;
            payload_lens.push(loaded.into_int_value());
        }

        // Combined payload length: running sum over all headers.
        let mut combined_len = payload_lens[0];
        for &extra in payload_lens.iter().skip(1) {
            let sum_name = self.next_name("strconcat_sumlen");
            combined_len = self
                .builder
                .build_int_add(combined_len, extra, &sum_name)
                .map_err(|e| LlvmError::Codegen(format!("StrConcatN sum: {e}")))?;
        }

        // The record needs room for the header on top of the payload.
        let size_name = self.next_name("strconcat_recsize");
        let alloc_size = self
            .builder
            .build_int_add(combined_len, header_bytes, &size_name)
            .map_err(|e| LlvmError::Codegen(format!("StrConcatN record_size: {e}")))?;

        // The scratch allocator leaves the new record's offset on the stack.
        self.emit_alloc_scratch_common(alloc_size)?;
        let record_off = self.pop_int(ip_hint)?;

        // Stamp the header with the combined length.
        let record_ptr = self.arena_addr_i32_checked_const(record_off, 4, "StrConcatN header")?;
        self.builder
            .build_store(record_ptr, combined_len)
            .map_err(|e| LlvmError::Codegen(format!("StrConcatN header store: {e}")))?;

        // Copy payloads back to back, starting right behind the header.
        let first_cursor_name = self.next_name("strconcat_cursor0");
        let mut write_off = self
            .builder
            .build_int_add(record_off, header_bytes, &first_cursor_name)
            .map_err(|e| LlvmError::Codegen(format!("StrConcatN cursor init: {e}")))?;
        for (&operand, &len) in operand_offs.iter().zip(payload_lens.iter()) {
            let src_name = self.next_name("strconcat_srcoff");
            let payload_off = self
                .builder
                .build_int_add(operand, header_bytes, &src_name)
                .map_err(|e| LlvmError::Codegen(format!("StrConcatN src off: {e}")))?;
            let dst_ptr = self.arena_addr_i32_checked(write_off, len, "StrConcatN dst")?;
            let src_ptr = self.arena_addr_i32_checked(payload_off, len, "StrConcatN src")?;
            let zext_name = self.next_name("strconcat_lenzext");
            let len_wide = self
                .builder
                .build_int_z_extend(len, wide_ty, &zext_name)
                .map_err(|e| LlvmError::Codegen(format!("StrConcatN len zext: {e}")))?;
            self.builder
                .build_memcpy(dst_ptr, 1, src_ptr, 1, len_wide)
                .map_err(|e| LlvmError::Codegen(format!("StrConcatN memcpy: {e}")))?;
            let bump_name = self.next_name("strconcat_cursornext");
            write_off = self
                .builder
                .build_int_add(write_off, len, &bump_name)
                .map_err(|e| LlvmError::Codegen(format!("StrConcatN cursor bump: {e}")))?;
        }

        // Result: offset of the freshly built record.
        self.push(record_off, IrType::String);
        Ok(())
    }

    /// Lower `Op::Add(IrType::String)` with the W3 reduce-accumulator
    /// fast path. Pops `[lhs_off, rhs_off]` (i32 arena offsets); emits a
    /// runtime branch that picks between:
    ///
    /// * **In-place append (fast)** — when `lhs` is the most recent
    ///   scratch allocation (`lhs_off + 4 + lhs_len == scratch_base +
    ///   scratch_cursor`), extend the existing record by `rhs_len`
    ///   bytes. Updates the header in-place, copies only the rhs
    ///   payload, bumps `scratch_cursor` by `rhs_len`. Result offset =
    ///   `lhs_off`. This is the W3 hot loop's steady-state path: every
    ///   iteration's freshly-built accumulator is the most recent
    ///   allocation, so concatenating one more byte costs O(1) (a
    ///   single byte store + cursor bump) instead of the historical
    ///   O(N) re-copy of the running accumulator.
    /// * **Full alloc + copy (slow)** — when the lhs sits somewhere
    ///   else in the arena (e.g. const-pool literal, scratch alloc
    ///   from a different sub-expression). Replicates the historical
    ///   `concat` stdlib body: allocate `lhs_len + rhs_len + 4` bytes
    ///   of scratch, stamp the header, memcpy both payloads. Result
    ///   offset = the freshly-allocated base.
    ///
    /// The two arms merge at a phi node, and the resulting i32 offset
    /// is pushed back tagged as [`IrType::String`].
    ///
    /// ## Correctness ground
    ///
    /// The in-place mutation overwrites both:
    /// * the existing `[len: u32]` header at `[lhs_off..lhs_off+4]`,
    /// * the bytes immediately past the existing payload, at
    ///   `[lhs_off+4+lhs_len .. lhs_off+4+lhs_len+rhs_len]`.
    ///
    /// The guard `lhs_off + 4 + lhs_len == scratch_base +
    /// scratch_cursor` ensures the bytes past the payload are inside
    /// the unallocated scratch tail — no other live data sits there.
    /// The result offset shares its base with the lhs, so any
    /// subsequent reader that previously held `lhs_off` would now see
    /// the longer record — but in the reduce pattern the lhs slot
    /// (`acc`) is immediately overwritten by the `LetSet` that follows
    /// `Op::Add(String)`, so no stale alias remains.
    ///
    /// The fast path advances `scratch_cursor` by only `rhs_len`
    /// bytes, whereas the slow path requests a fresh record of
    /// `lhs_len + rhs_len + 4` bytes. The fast path therefore never
    /// consumes more scratch than the slow path would have, so the
    /// arena's out-of-bounds budget is *strictly tighter* than the
    /// historical path — there is no new failure mode where the fast
    /// path exceeds the arena while the slow path would have fit.
    ///
    /// # Errors
    ///
    /// Returns [`LlvmError::Codegen`] when the emitter has no
    /// `arena_base_ptr` or `state_ptr` (i.e. the op is used outside the
    /// buffer-protocol entry shape) or when an LLVM builder call
    /// fails. Errors from the stack / arena-address / scratch-alloc
    /// helpers are propagated unchanged.
    pub(crate) fn emit_str_add_inplace_or_concat(
        &mut self,
        ip_hint: &str,
    ) -> Result<(), LlvmError> {
        // Both lookups double as precondition checks: bail out before
        // emitting anything if the function was not set up with an
        // arena base / arena state pointer.
        let arena_base_ptr = self.arena_base_ptr.ok_or_else(|| {
            LlvmError::Codegen(
                "Op::Add(String) outside buffer-protocol entry shape (no arena_base)".into(),
            )
        })?;
        let state_ptr = self.state_ptr.ok_or_else(|| {
            LlvmError::Codegen(
                "Op::Add(String) outside buffer-protocol entry shape (no state)".into(),
            )
        })?;
        let i32_t = self.ctx.i32_type();
        let i8_t = self.ctx.i8_type();
        let i64_t = self.ctx.i64_type();

        // Pop in reverse order: stack is `[lhs, rhs]`, top is rhs.
        // Phase L W3: keep the TypedValue so we can read provenance
        // (notably `Provenance::ConstString { len, first_byte }`) to
        // pick the const-len fast path below. LLVM cannot prove the
        // const length on its own — the rhs offset is a runtime i32
        // that happens to point into the const-pool prefix, and the
        // `[len]` header at that offset is reloaded every iteration
        // because the in-place append's header store at `lhs_addr`
        // aliases against it from the optimiser's point of view.
        let rhs_tv = self.pop(ip_hint)?;
        let lhs_tv = self.pop(ip_hint)?;
        let rhs_off = rhs_tv.val;
        let lhs_off = lhs_tv.val;
        // `Some((len, first_byte))` only when the rhs came from a
        // const-string literal; `first_byte` feeds the single-byte
        // store specialisation in the fast path.
        let rhs_const_len: Option<(u32, Option<u8>)> = match rhs_tv.prov {
            Provenance::ConstString { len, first_byte } => Some((len, first_byte)),
            _ => None,
        };
        // SAFETY: when the *lhs* is sourced from `Op::ConstString` the
        // operand points into the per-module const-pool prefix (read-
        // only). Allowing the in-place fast path to fire in that case
        // would write the new `[len]` header — and the appended payload
        // — *into the const pool*, corrupting every subsequent
        // `Op::ConstString` load. We deliberately do **not** propagate
        // const-len knowledge for the lhs: keep the runtime `[len]`
        // load + the `lhs_end == scratch_end` runtime guard. In
        // practice the const-pool record sits at a fixed prefix offset
        // and the scratch tail is past every literal, so the guard
        // mismatches and the slow path (fresh scratch alloc + double
        // memcpy) takes over for the W3 reduce's first iteration
        // (`acc = "" + "a"`). The const-len optimisation is restricted
        // to the rhs slot.
        let lhs_const_len: Option<u32> = None;
        // The lhs provenance is intentionally never consulted (see
        // above). This no-op binding only mirrors the rhs handling for
        // structural symmetry with `rhs_const_len`; `lhs_tv.val` has
        // already been read into `lhs_off`.
        let _ = lhs_tv;

        // Load lhs.len and rhs.len from header word at offset 0 of
        // each record. Phase L W3: when the operand is known
        // const-string (provenance carries the literal byte length),
        // skip the per-iter `[len]` header load and feed LLVM an
        // i32 const — this removes the alias hazard between the
        // in-place store at `lhs_addr` and the rhs header read.
        //
        // `lhs_addr` is resolved unconditionally because the fast path
        // re-uses it as the destination of the updated header store.
        let lhs_addr = self.arena_addr_i32_checked_const(lhs_off, 4, "Add(String) lhs len")?;
        // `lhs_const_len` is always `None` today, so the load arm is
        // the one that is actually emitted.
        let lhs_len = if let Some(len) = lhs_const_len {
            i32_t.const_int(u64::from(len), false)
        } else {
            self.builder
                .build_load(i32_t, lhs_addr, "stradd_lhs_len")
                .map_err(|e| LlvmError::Codegen(format!("Add(String) lhs len load: {e}")))?
                .into_int_value()
        };
        let rhs_len = if let Some((len, _)) = rhs_const_len {
            i32_t.const_int(u64::from(len), false)
        } else {
            let rhs_addr = self.arena_addr_i32_checked_const(rhs_off, 4, "Add(String) rhs len")?;
            self.builder
                .build_load(i32_t, rhs_addr, "stradd_rhs_len")
                .map_err(|e| LlvmError::Codegen(format!("Add(String) rhs len load: {e}")))?
                .into_int_value()
        };

        // Read scratch_base + scratch_cursor from the arena state.
        // SAFETY (assumed — TODO confirm against `crate::state`): the
        // in-bounds GEP is only sound if the constant byte offset
        // `ARENA_STATE_OFFSET_SCRATCH_CURSOR` lies inside the state
        // block that `state_ptr` points at.
        let scratch_cur_gep = unsafe {
            self.builder
                .build_in_bounds_gep(
                    i8_t,
                    state_ptr,
                    &[i32_t.const_int(u64::from(ARENA_STATE_OFFSET_SCRATCH_CURSOR), false)],
                    "stradd_scratch_cur_gep",
                )
                .map_err(|e| LlvmError::Codegen(format!("scratch_cur GEP: {e}")))?
        };
        let scratch_cur = self
            .builder
            .build_load(i32_t, scratch_cur_gep, "stradd_scratch_cur")
            .map_err(|e| LlvmError::Codegen(format!("scratch_cur load: {e}")))?
            .into_int_value();
        // SAFETY (assumed — TODO confirm against `crate::state`): same
        // reasoning as above, for `ARENA_STATE_OFFSET_SCRATCH_BASE`.
        let scratch_base_gep = unsafe {
            self.builder
                .build_in_bounds_gep(
                    i8_t,
                    state_ptr,
                    &[i32_t.const_int(u64::from(ARENA_STATE_OFFSET_SCRATCH_BASE), false)],
                    "stradd_scratch_base_gep",
                )
                .map_err(|e| LlvmError::Codegen(format!("scratch_base GEP: {e}")))?
        };
        let scratch_base = self
            .builder
            .build_load(i32_t, scratch_base_gep, "stradd_scratch_base")
            .map_err(|e| LlvmError::Codegen(format!("scratch_base load: {e}")))?
            .into_int_value();

        // lhs_end = lhs_off + 4 + lhs_len
        let four = i32_t.const_int(4, false);
        let lhs_off_plus_4 = self
            .builder
            .build_int_add(lhs_off, four, "stradd_lhs_off_plus4")
            .map_err(|e| LlvmError::Codegen(format!("stradd lhs+4: {e}")))?;
        let lhs_end = self
            .builder
            .build_int_add(lhs_off_plus_4, lhs_len, "stradd_lhs_end")
            .map_err(|e| LlvmError::Codegen(format!("stradd lhs_end: {e}")))?;
        // scratch_end = scratch_base + scratch_cursor
        let scratch_end = self
            .builder
            .build_int_add(scratch_base, scratch_cur, "stradd_scratch_end")
            .map_err(|e| LlvmError::Codegen(format!("stradd scratch_end: {e}")))?;
        // Runtime guard: the lhs record is the scratch tail iff its end
        // coincides with the current end of the scratch region.
        let is_tail = self
            .builder
            .build_int_compare(IntPredicate::EQ, lhs_end, scratch_end, "stradd_is_tail")
            .map_err(|e| LlvmError::Codegen(format!("stradd cmp: {e}")))?;

        let fast_bb = self.ctx.append_basic_block(self.func, "stradd_fast");
        let slow_bb = self.ctx.append_basic_block(self.func, "stradd_slow");
        let merge_bb = self.ctx.append_basic_block(self.func, "stradd_merge");
        self.builder
            .build_conditional_branch(is_tail, fast_bb, slow_bb)
            .map_err(|e| LlvmError::Codegen(format!("stradd branch: {e}")))?;

        // --- fast path: in-place append ---
        self.builder.position_at_end(fast_bb);
        let total_len_fast = self
            .builder
            .build_int_add(lhs_len, rhs_len, "stradd_fast_total")
            .map_err(|e| LlvmError::Codegen(format!("stradd fast total: {e}")))?;
        // store updated header
        // NOTE(review): this store is emitted before the destination
        // range is resolved through `arena_addr_i32_checked` below. If
        // that helper can fail at runtime, the header would already
        // carry the new length — confirm whether that matters to
        // whoever observes the arena after such a failure.
        self.builder
            .build_store(lhs_addr, total_len_fast)
            .map_err(|e| LlvmError::Codegen(format!("stradd fast header store: {e}")))?;
        // Append the rhs payload onto the lhs tail. Phase L W3: when
        // the rhs is a known const string (the dominant W3 reduce
        // shape — `acc + "a"`), specialise the copy:
        //   * len == 1 — emit a single `store i8 byte, ptr` against
        //     the lhs tail; bypasses the memcpy intrinsic entirely
        //     so the LLVM mid-end sees just a one-byte store + cursor
        //     bump (matching `String::push_str("a")`).
        //   * len > 1 — still use `build_memcpy`, but pass an i64
        //     const for the size so LLVM's `expand-memcpy` lowering
        //     unrolls to inline loads/stores instead of an indirect
        //     `callq *memcpy`.
        //   * non-const — historical path: zext runtime rhs_len to
        //     i64 and hand it to memcpy.
        // Note that the second arm also catches a const rhs of length
        // 0, and a length-1 const whose `first_byte` is `None`.
        let fast_dst = self.arena_addr_i32_checked(lhs_end, rhs_len, "Add(String) fast dst")?;
        match rhs_const_len {
            Some((1, Some(byte))) => {
                let byte_const = i8_t.const_int(u64::from(byte), false);
                self.builder
                    .build_store(fast_dst, byte_const)
                    .map_err(|e| {
                        LlvmError::Codegen(format!("stradd fast inline-byte store: {e}"))
                    })?;
            }
            Some((len, _)) => {
                let fast_src = self.arena_addr_i32_offset_checked_const(
                    rhs_off,
                    4,
                    len,
                    "Add(String) fast src",
                )?;
                let rhs_len64 = i64_t.const_int(u64::from(len), false);
                self.builder
                    .build_memcpy(fast_dst, 1, fast_src, 1, rhs_len64)
                    .map_err(|e| {
                        LlvmError::Codegen(format!("stradd fast memcpy (const-len): {e}"))
                    })?;
            }
            None => {
                let fast_src = self.arena_addr_i32_offset_checked(
                    rhs_off,
                    4,
                    rhs_len,
                    "Add(String) fast src",
                )?;
                let rhs_len64 = self
                    .builder
                    .build_int_z_extend(rhs_len, i64_t, "stradd_rhs_len64")
                    .map_err(|e| LlvmError::Codegen(format!("stradd rhs_len zext: {e}")))?;
                self.builder
                    .build_memcpy(fast_dst, 1, fast_src, 1, rhs_len64)
                    .map_err(|e| LlvmError::Codegen(format!("stradd fast memcpy: {e}")))?;
            }
        }
        // bump scratch_cursor by rhs_len
        let new_cur = self
            .builder
            .build_int_add(scratch_cur, rhs_len, "stradd_fast_newcur")
            .map_err(|e| LlvmError::Codegen(format!("stradd fast new cur: {e}")))?;
        self.builder
            .build_store(scratch_cur_gep, new_cur)
            .map_err(|e| LlvmError::Codegen(format!("stradd fast cursor store: {e}")))?;
        // Capture the block the builder is in *now* rather than
        // re-using `fast_bb`: the phi below needs the actual
        // predecessor of `merge_bb`, and the checked address helpers
        // above presumably may have moved the insertion point into a
        // continuation block. The builder was positioned a few lines
        // up, so the `unwrap` relies on that invariant.
        let fast_end_bb = self.builder.get_insert_block().unwrap();
        self.builder
            .build_unconditional_branch(merge_bb)
            .map_err(|e| LlvmError::Codegen(format!("stradd fast->merge: {e}")))?;

        // --- slow path: full alloc + double memcpy ---
        self.builder.position_at_end(slow_bb);
        // total_len = lhs_len + rhs_len
        let total_len_slow = self
            .builder
            .build_int_add(lhs_len, rhs_len, "stradd_slow_total")
            .map_err(|e| LlvmError::Codegen(format!("stradd slow total: {e}")))?;
        // record_size = total_len + 4
        let record_size = self
            .builder
            .build_int_add(total_len_slow, four, "stradd_slow_recsize")
            .map_err(|e| LlvmError::Codegen(format!("stradd slow recsize: {e}")))?;
        // The allocator helper leaves the new record's offset on the
        // operand stack; pop it straight back off.
        self.emit_alloc_scratch_common(record_size)?;
        let base_off = self.pop_int(ip_hint)?;
        // write header at base
        let base_addr =
            self.arena_addr_i32_checked_const(base_off, 4, "Add(String) slow header")?;
        self.builder
            .build_store(base_addr, total_len_slow)
            .map_err(|e| LlvmError::Codegen(format!("stradd slow header store: {e}")))?;
        // memcpy lhs payload to base+4
        // (`base_plus_4` itself is only consumed further down, as the
        // starting point for the rhs destination cursor.)
        let base_plus_4 = self
            .builder
            .build_int_add(base_off, four, "stradd_slow_basep4")
            .map_err(|e| LlvmError::Codegen(format!("stradd slow base+4: {e}")))?;
        let dst1 =
            self.arena_addr_i32_offset_checked(base_off, 4, lhs_len, "Add(String) slow lhs dst")?;
        let src1 =
            self.arena_addr_i32_offset_checked(lhs_off, 4, lhs_len, "Add(String) slow lhs src")?;
        // Phase L W3: hand LLVM an i64 const memcpy size whenever
        // the lhs / rhs comes from `Op::ConstString` so the
        // `expand-memcpy` lowering can unroll to inline stores
        // instead of an indirect `callq *memcpy`. Falls back to the
        // historical zext path for non-const operands.
        // (For the lhs this is currently always the zext path, since
        // `lhs_const_len` is fixed to `None` above.)
        let lhs_len64: IntValue<'ctx> = if let Some(len) = lhs_const_len {
            i64_t.const_int(u64::from(len), false)
        } else {
            self.builder
                .build_int_z_extend(lhs_len, i64_t, "stradd_slow_lhs64")
                .map_err(|e| LlvmError::Codegen(format!("stradd slow lhs_len zext: {e}")))?
        };
        self.builder
            .build_memcpy(dst1, 1, src1, 1, lhs_len64)
            .map_err(|e| LlvmError::Codegen(format!("stradd slow lhs memcpy: {e}")))?;
        // memcpy rhs payload to base+4+lhs_len
        let lhs_dst_cursor = self
            .builder
            .build_int_add(base_plus_4, lhs_len, "stradd_slow_cur2")
            .map_err(|e| LlvmError::Codegen(format!("stradd slow cur2: {e}")))?;
        let dst2 =
            self.arena_addr_i32_checked(lhs_dst_cursor, rhs_len, "Add(String) slow rhs dst")?;
        let src2 =
            self.arena_addr_i32_offset_checked(rhs_off, 4, rhs_len, "Add(String) slow rhs src")?;
        let rhs_len64_slow: IntValue<'ctx> = if let Some((len, _)) = rhs_const_len {
            i64_t.const_int(u64::from(len), false)
        } else {
            self.builder
                .build_int_z_extend(rhs_len, i64_t, "stradd_slow_rhs64")
                .map_err(|e| LlvmError::Codegen(format!("stradd slow rhs_len zext: {e}")))?
        };
        self.builder
            .build_memcpy(dst2, 1, src2, 1, rhs_len64_slow)
            .map_err(|e| LlvmError::Codegen(format!("stradd slow rhs memcpy: {e}")))?;
        // Same reasoning as for `fast_end_bb`: use the builder's
        // current block as the phi predecessor, not `slow_bb`.
        let slow_end_bb = self.builder.get_insert_block().unwrap();
        self.builder
            .build_unconditional_branch(merge_bb)
            .map_err(|e| LlvmError::Codegen(format!("stradd slow->merge: {e}")))?;

        // --- merge: phi of lhs_off / base_off ---
        // Fast arm yields the (re-used) lhs offset, slow arm the
        // freshly allocated record's offset.
        self.builder.position_at_end(merge_bb);
        let phi = self
            .builder
            .build_phi(i32_t, "stradd_result")
            .map_err(|e| LlvmError::Codegen(format!("stradd phi: {e}")))?;
        let lhs_off_val: BasicValueEnum<'ctx> = lhs_off.into();
        let base_off_val: BasicValueEnum<'ctx> = base_off.into();
        phi.add_incoming(&[(&lhs_off_val, fast_end_bb), (&base_off_val, slow_end_bb)]);
        let result = phi.as_basic_value().into_int_value();
        // `arena_base_ptr` is not used directly in this function (the
        // checked arena-address helpers presumably read it from
        // `self`); the lookup at the top only serves as a precondition
        // check. Discard it explicitly to silence the unused-variable
        // lint.
        let _ = arena_base_ptr;
        self.push(result, IrType::String);
        Ok(())
    }

    /// Phase F.1: lowering of `contains(haystack: String, needle:
    /// String) -> Bool` as a direct extern call to
    /// `relon_llvm_str_contains_arena`, rather than inlining the
    /// bundled stdlib body. The `str_helpers` module docs cover the ABI
    /// and the motivation (closing the W4 / W4_long gap vs LuaJIT via
    /// std's SIMD-backed `str::contains`).
    ///
    /// Stack contract: `needle_off` is popped first (it is on top),
    /// `haystack_off` second. The i32 0/1 result is pushed tagged as
    /// [`IrType::Bool`], the same width the inlined body would have
    /// handed to downstream `If` / `BrIf` ops.
    ///
    /// # Errors
    ///
    /// Propagates any [`LlvmError`] raised while popping the operands
    /// or while emitting the shim call.
    pub(crate) fn emit_str_contains_extern(&mut self, ip_hint: &str) -> Result<(), LlvmError> {
        // The IR pushed `[haystack, needle]`, so the operands come off
        // the stack in the opposite order to the shim's parameter
        // list; the shared backbone takes them haystack-first.
        let needle = self.pop_int(ip_hint)?;
        let haystack = self.pop_int(ip_hint)?;
        self.emit_str_contains_extern_with_offsets(ip_hint, haystack, needle)
    }

    /// Phase H: backbone that emits the extern shim call once the two
    /// i32 record offsets are already in hand. Factored out of
    /// [`Self::emit_str_contains_extern`] so that the const-needle fast
    /// path can fall back to the shim for `needle.len() > 1` (where the
    /// inline byte-scan no longer beats the shim's SIMD-backed Two-Way
    /// matcher).
    ///
    /// For each operand the emitted code reads the `[len: u32]` header,
    /// resolves the complete `[len][payload]` record through the
    /// checked arena-address helper, and passes the record pointers to
    /// `relon_llvm_str_contains_arena` (haystack first). The shim's i32
    /// result is pushed as [`IrType::Bool`]. `_ip_hint` is accepted for
    /// signature parity with the other emitters and is not used.
    ///
    /// # Errors
    ///
    /// Returns [`LlvmError::Codegen`] when an LLVM builder call fails or
    /// when the shim call does not yield an integer value; errors from
    /// the arena-address helpers are propagated unchanged.
    pub(crate) fn emit_str_contains_extern_with_offsets(
        &mut self,
        _ip_hint: &str,
        haystack_off: IntValue<'ctx>,
        needle_off: IntValue<'ctx>,
    ) -> Result<(), LlvmError> {
        // Both records are resolved (header + payload) through the
        // checked helpers before any raw pointer reaches the host shim.
        let word_ty = self.ctx.i32_type();
        let wide_ty = self.ctx.i64_type();
        let header_bytes = wide_ty.const_int(4, false);

        // --- haystack: header -> payload length -> full record ---
        let hay_header_ptr =
            self.arena_addr_i32_checked_const(haystack_off, 4, "contains haystack header")?;
        let hay_len_name = self.next_name("contains_hay_len");
        let hay_payload_len = self
            .builder
            .build_load(word_ty, hay_header_ptr, &hay_len_name)
            .map_err(|e| LlvmError::Codegen(format!("contains haystack len: {e}")))?
            .into_int_value();
        let hay_wide_name = self.next_name("contains_hay_len64");
        let hay_payload_len_wide = self
            .builder
            .build_int_z_extend(hay_payload_len, wide_ty, &hay_wide_name)
            .map_err(|e| LlvmError::Codegen(format!("contains haystack len64: {e}")))?;
        let hay_span_name = self.next_name("contains_hay_record_len");
        let hay_span = self
            .builder
            .build_int_add(hay_payload_len_wide, header_bytes, &hay_span_name)
            .map_err(|e| LlvmError::Codegen(format!("contains haystack record len: {e}")))?;
        let hay_record_ptr =
            self.arena_addr_i32_checked(haystack_off, hay_span, "contains haystack record")?;

        // --- needle: same three steps ---
        let needle_header_ptr =
            self.arena_addr_i32_checked_const(needle_off, 4, "contains needle header")?;
        let needle_len_name = self.next_name("contains_needle_len");
        let needle_payload_len = self
            .builder
            .build_load(word_ty, needle_header_ptr, &needle_len_name)
            .map_err(|e| LlvmError::Codegen(format!("contains needle len: {e}")))?
            .into_int_value();
        let needle_wide_name = self.next_name("contains_needle_len64");
        let needle_payload_len_wide = self
            .builder
            .build_int_z_extend(needle_payload_len, wide_ty, &needle_wide_name)
            .map_err(|e| LlvmError::Codegen(format!("contains needle len64: {e}")))?;
        let needle_span_name = self.next_name("contains_needle_record_len");
        let needle_span = self
            .builder
            .build_int_add(needle_payload_len_wide, header_bytes, &needle_span_name)
            .map_err(|e| LlvmError::Codegen(format!("contains needle record len: {e}")))?;
        let needle_record_ptr =
            self.arena_addr_i32_checked(needle_off, needle_span, "contains needle record")?;

        // Fetch the shim declaration; the lookup is idempotent, so every
        // `contains` call site in a module ends up sharing one extern.
        let shim_fn = self.declare_str_contains_extern();

        let shim_args = [
            BasicMetadataValueEnum::PointerValue(hay_record_ptr),
            BasicMetadataValueEnum::PointerValue(needle_record_ptr),
        ];
        let call_label = self.next_name("str_contains_extern");
        let shim_call = self
            .builder
            .build_call(shim_fn, &shim_args, &call_label)
            .map_err(|e| LlvmError::Codegen(format!("str_contains call: {e}")))?;

        // Only an integer result is acceptable; a void call or any
        // other value kind is reported as a codegen error.
        let found_flag = match shim_call.try_as_basic_value() {
            inkwell::values::ValueKind::Basic(BasicValueEnum::IntValue(flag)) => flag,
            inkwell::values::ValueKind::Basic(other) => {
                return Err(LlvmError::Codegen(format!(
                    "relon_llvm_str_contains_arena returned non-int {other:?}"
                )));
            }
            inkwell::values::ValueKind::Instruction(_) => {
                return Err(LlvmError::Codegen(
                    "relon_llvm_str_contains_arena returned void; expected i32".into(),
                ));
            }
        };

        // Bool travels as an i32 0/1 across the LLVM AOT envelope — the
        // shape the inlined `contains_string_body` produced via
        // `Op::Ne(I32)` against `0` — so the shim result is pushed
        // as-is, without truncation or extension.
        self.push(found_flag, IrType::Bool);
        Ok(())
    }

    /// Phase H: lower `contains(haystack, "literal") -> Bool` for the
    /// const-needle case detected at the `Op::Call` site.
    ///
    /// Operand stack contract: pops `needle_off` (top), then
    /// `haystack_off`, and pushes the i32 0/1 result as
    /// [`IrType::Bool`]. Both operands are always popped — even on the
    /// arms that never read `needle_off` — so the virtual stack stays
    /// balanced whichever arm runs.
    ///
    /// Dispatch by needle length:
    /// - `0` — every haystack contains the empty string; push `i32(1)`
    ///   directly. Neither offset is dereferenced on this arm.
    /// - `1` — delegate to `emit_str_contains_inline_byte`, which emits
    ///   a direct libc `memchr` call over the haystack payload (it is
    ///   *not* an open-coded scan loop) and therefore never calls the
    ///   `relon_llvm_str_contains_arena` shim. `needle_off` is discarded
    ///   because the byte is already known at compile time.
    /// - `> 1` — forward both offsets to
    ///   `emit_str_contains_extern_with_offsets`, i.e. the extern shim
    ///   path.
    ///
    /// # Errors
    ///
    /// Propagates any [`LlvmError`] raised while popping the operands
    /// or by the selected lowering.
    pub(crate) fn emit_str_contains_const_needle(
        &mut self,
        ip_hint: &str,
        needle_bytes: &[u8],
    ) -> Result<(), LlvmError> {
        // Pop both operands up-front so every arm leaves the operand
        // stack in the same state. Only the multi-byte arm actually
        // consumes `needle_off`.
        let needle_off = self.pop_int(ip_hint)?;
        let haystack_off = self.pop_int(ip_hint)?;

        // Slice patterns make the length dispatch and the byte
        // extraction a single step, with no separate indexing that
        // could panic if the arms were ever reordered.
        match needle_bytes {
            [] => {
                // Empty needle: always matches. Push `i32(1)` typed as
                // Bool, the same 0/1 encoding the other arms produce.
                let one = self.ctx.i32_type().const_int(1, false);
                self.push(one, IrType::Bool);
                Ok(())
            }
            [needle_byte] => {
                self.emit_str_contains_inline_byte(ip_hint, haystack_off, *needle_byte)
            }
            _ => {
                // Multi-byte needle: hand both arena offsets to the
                // extern shim lowering.
                self.emit_str_contains_extern_with_offsets(ip_hint, haystack_off, needle_off)
            }
        }
    }

    /// Phase H: single-byte const-needle `contains`, lowered to one
    /// direct call to libc `memchr`. Pushes the i32 0/1 answer tagged
    /// as [`IrType::Bool`].
    ///
    /// The haystack record at `arena_base + haystack_off` is read as a
    /// 4-byte length header followed by that many payload bytes. The
    /// emitted IR has this shape:
    ///
    /// ```text
    /// hay_len   = load i32, ptr (arena_base + haystack_off)
    /// hay_payld = gep (arena_base + haystack_off + 4)
    /// hay_len64 = zext i32 hay_len -> i64
    /// res_ptr   = call ptr @memchr(ptr hay_payld, i32 needle_byte, i64 hay_len64)
    /// hit       = icmp ne ptr res_ptr, null
    /// result    = zext i1 hit -> i32
    /// ```
    ///
    /// Both the header read and the payload span go through the
    /// bounds-checked arena address helpers before `memchr` sees them.
    ///
    /// ## Rationale (as recorded for this lowering)
    ///
    /// An open-coded byte scan has a data-dependent early exit, which
    /// LLVM 18's loop vectoriser reportedly declines to vectorise;
    /// libc's `memchr` already ships a SIMD implementation, and calling
    /// it directly avoids the `relon_llvm_str_contains_arena` shim's
    /// per-call overhead.
    ///
    /// ## Symbol resolution
    ///
    /// `memchr` is declared with [`Linkage::External`] by
    /// `declare_libc_memchr`; the JIT is expected to resolve it from
    /// libc without an explicit `engine.add_global_mapping`.
    ///
    /// NOTE(review): the length argument is passed as `i64`, which
    /// matches `size_t` only on 64-bit targets — confirm this path is
    /// never taken when targeting wasm32.
    ///
    /// `_ip_hint` is accepted for signature parity with the sibling
    /// lowerings and is not used here.
    pub(crate) fn emit_str_contains_inline_byte(
        &mut self,
        _ip_hint: &str,
        haystack_off: IntValue<'ctx>,
        needle_byte: u8,
    ) -> Result<(), LlvmError> {
        // Shared error-context wrapper for every builder call below.
        let cg = |what: &str, e: inkwell::builder::BuilderError| {
            LlvmError::Codegen(format!("str_contains_inline {what}: {e}"))
        };

        let int32 = self.ctx.i32_type();
        let int64 = self.ctx.i64_type();
        let opaque_ptr = self.ctx.ptr_type(AddressSpace::default());
        // `memchr` takes the byte as a C `int`, so widen it to i32.
        let byte_arg = int32.const_int(u64::from(needle_byte), false);

        // Read the 4-byte length header of the haystack record.
        let header_ptr =
            self.arena_addr_i32_checked_const(haystack_off, 4, "contains-inline header")?;
        let payload_len = self
            .builder
            .build_load(int32, header_ptr, &self.next_name("strc_inl_haylen"))
            .map_err(|e| cg("hay_len", e))?
            .into_int_value();

        // Payload starts 4 bytes past the record base and spans
        // `payload_len` bytes.
        let payload_ptr = self.arena_addr_i32_offset_checked(
            haystack_off,
            4,
            payload_len,
            "contains-inline payload",
        )?;
        let scan_len = self
            .builder
            .build_int_z_extend(payload_len, int64, &self.next_name("strc_inl_haylen64"))
            .map_err(|e| cg("hay_len64", e))?;

        // One `memchr` declaration per module; reuse it if present.
        let memchr_fn = self.declare_libc_memchr();
        let memchr_args = [
            BasicMetadataValueEnum::PointerValue(payload_ptr),
            BasicMetadataValueEnum::IntValue(byte_arg),
            BasicMetadataValueEnum::IntValue(scan_len),
        ];
        let call_site = self
            .builder
            .build_call(memchr_fn, &memchr_args, &self.next_name("strc_inl_memchr"))
            .map_err(|e| cg("memchr call", e))?;
        let found_ptr = match call_site.try_as_basic_value() {
            inkwell::values::ValueKind::Basic(BasicValueEnum::PointerValue(p)) => p,
            other => {
                return Err(LlvmError::Codegen(format!(
                    "memchr returned non-pointer: {other:?}"
                )));
            }
        };

        // A non-null result means the byte occurs in the payload.
        let null_ptr = opaque_ptr.const_null();
        let is_hit = self
            .builder
            .build_int_compare(
                IntPredicate::NE,
                found_ptr,
                null_ptr,
                &self.next_name("strc_inl_hit"),
            )
            .map_err(|e| cg("cmp", e))?;
        // Widen the i1 to the i32 0/1 Bool encoding.
        let bool_i32 = self
            .builder
            .build_int_z_extend(is_hit, int32, &self.next_name("strc_inl_res"))
            .map_err(|e| cg("zext", e))?;
        self.push(bool_i32, IrType::Bool);
        Ok(())
    }

    /// Return the module's declaration of libc `memchr`, creating it
    /// on first use.
    ///
    /// Idempotent: if the module already has a function named
    /// `memchr`, that existing `FunctionValue` is returned unchanged;
    /// otherwise an external-linkage declaration with the signature
    /// `ptr (ptr, i32, i64)` is added. No global mapping is registered
    /// here — the symbol is expected to be resolved from libc by the
    /// execution engine.
    pub(crate) fn declare_libc_memchr(&self) -> FunctionValue<'ctx> {
        const SYM: &str = "memchr";
        self.module.get_function(SYM).unwrap_or_else(|| {
            let byte_ptr = self.ctx.ptr_type(AddressSpace::default());
            let c_int = self.ctx.i32_type();
            let size = self.ctx.i64_type();
            // C prototype: void *memchr(const void *s, int c, size_t n)
            let signature =
                byte_ptr.fn_type(&[byte_ptr.into(), c_int.into(), size.into()], false);
            self.module
                .add_function(SYM, signature, Some(Linkage::External))
        })
    }

    /// Return the module's declaration of the
    /// [`crate::str_helpers::relon_llvm_str_contains_arena`] extern,
    /// creating it on first use.
    ///
    /// Idempotent: an existing function with the shim's symbol name is
    /// reused, so every call site in the module shares one
    /// declaration. The declared signature is `i32 (ptr, ptr)` with
    /// external linkage.
    pub(crate) fn declare_str_contains_extern(&self) -> FunctionValue<'ctx> {
        let symbol = crate::str_helpers::RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL;
        match self.module.get_function(symbol) {
            Some(existing) => existing,
            None => {
                let ret_i32 = self.ctx.i32_type();
                let record_ptr = self.ctx.ptr_type(AddressSpace::default());
                let signature = ret_i32.fn_type(&[record_ptr.into(), record_ptr.into()], false);
                self.module
                    .add_function(symbol, signature, Some(Linkage::External))
            }
        }
    }

    /// Lower `Op::IntToStr` — pop one `I64`, materialise its base-10
    /// decimal `String` record in the scratch arena, push the i32
    /// record offset. Byte-exact with the tree-walker's `i64`
    /// `Display`: leading `-` for negatives, no leading zeros, `0` for
    /// zero, `i64::MIN` → `-9223372036854775808`. Mirrors cranelift's
    /// `emit_int_to_str` instruction-for-instruction (count digits,
    /// alloc `[len][digits]`, fill back-to-front, prepend sign). No
    /// libc itoa, so the wasm leg needs no extra import.
    ///
    /// The emitted control flow is: a counting loop
    /// (`i2s_count_hdr` / `i2s_count_body` / `i2s_count_done`), a
    /// do-while digit-writing loop (`i2s_write_hdr` → `i2s_write_done`),
    /// and a conditional sign store (`i2s_minus_body` /
    /// `i2s_minus_done`). On return the builder is positioned at the
    /// end of `i2s_minus_done`.
    ///
    /// # Errors
    ///
    /// Returns [`LlvmError::Codegen`] when any builder call fails or
    /// when the builder has no current insert block; also propagates
    /// errors from the operand pops, the scratch allocation and the
    /// bounds-checked arena address helpers.
    pub(crate) fn emit_int_to_str(&mut self, ip_hint: &str) -> Result<(), LlvmError> {
        let i32_t = self.ctx.i32_type();
        let i64_t = self.ctx.i64_type();
        let i8_t = self.ctx.i8_type();
        // The integer to format (top of the virtual operand stack).
        let v = self.pop_int(ip_hint)?;

        // Constants reused throughout the lowering.
        let zero64 = i64_t.const_int(0, false);
        let ten64 = i64_t.const_int(10, false);
        let one32 = i32_t.const_int(1, false);
        let zero32 = i32_t.const_int(0, false);
        let four = i32_t.const_int(4, false);

        // Wraps an inkwell builder error with an `IntToStr`-prefixed
        // description of the step that failed.
        let cg = |e: inkwell::builder::BuilderError, what: &str| {
            LlvmError::Codegen(format!("IntToStr {what}: {e}"))
        };

        // is_neg = v < 0 (signed).
        let is_neg = self
            .builder
            .build_int_compare(IntPredicate::SLT, v, zero64, "i2s_isneg")
            .map_err(|e| cg(e, "isneg"))?;
        // mag = is_neg ? (0 - v) : v   (wrapping negate; correct for
        // i64::MIN). Reinterpreted unsigned for udiv/urem.
        let neg_v = self
            .builder
            .build_int_sub(zero64, v, "i2s_negv")
            .map_err(|e| cg(e, "negv"))?;
        let mag = self
            .builder
            .build_select(is_neg, neg_v, v, "i2s_mag")
            .map_err(|e| cg(e, "mag"))?
            .into_int_value();
        // sign_len = 1 when a leading '-' is needed, else 0.
        let sign_len = self
            .builder
            .build_select(is_neg, one32, zero32, "i2s_signlen")
            .map_err(|e| cg(e, "signlen"))?
            .into_int_value();

        // ---- Pass 1: count decimal digits of `mag` ----
        // cnt = 1; t = mag; while t >= 10 { t /= 10; cnt += 1 }
        let count_hdr = self.ctx.append_basic_block(self.func, "i2s_count_hdr");
        let count_body = self.ctx.append_basic_block(self.func, "i2s_count_body");
        let count_done = self.ctx.append_basic_block(self.func, "i2s_count_done");
        // Remember the block we are leaving: it is the phi predecessor
        // that supplies the loop's initial values.
        let pre_bb = self
            .builder
            .get_insert_block()
            .ok_or_else(|| LlvmError::Codegen("IntToStr: no insert block".into()))?;
        self.builder
            .build_unconditional_branch(count_hdr)
            .map_err(|e| cg(e, "to count_hdr"))?;

        // Loop header: phis carry the running quotient `t` and the
        // digit count; the back-edge values are attached further down
        // once `count_body` has produced them.
        self.builder.position_at_end(count_hdr);
        let t_phi = self
            .builder
            .build_phi(i64_t, "i2s_t")
            .map_err(|e| cg(e, "t phi"))?;
        let cnt_phi = self
            .builder
            .build_phi(i32_t, "i2s_cnt")
            .map_err(|e| cg(e, "cnt phi"))?;
        t_phi.add_incoming(&[(&mag, pre_bb)]);
        cnt_phi.add_incoming(&[(&one32, pre_bb)]);
        let t_val = t_phi.as_basic_value().into_int_value();
        let cnt_val = cnt_phi.as_basic_value().into_int_value();
        // Unsigned compare: `mag` is treated as an unsigned magnitude.
        let cont = self
            .builder
            .build_int_compare(IntPredicate::UGE, t_val, ten64, "i2s_cont")
            .map_err(|e| cg(e, "cont"))?;
        self.builder
            .build_conditional_branch(cont, count_body, count_done)
            .map_err(|e| cg(e, "count br"))?;

        // Loop body: strip one digit, bump the count, jump back.
        self.builder.position_at_end(count_body);
        let t_next = self
            .builder
            .build_int_unsigned_div(t_val, ten64, "i2s_tnext")
            .map_err(|e| cg(e, "tnext"))?;
        let cnt_next = self
            .builder
            .build_int_add(cnt_val, one32, "i2s_cntnext")
            .map_err(|e| cg(e, "cntnext"))?;
        t_phi.add_incoming(&[(&t_next, count_body)]);
        cnt_phi.add_incoming(&[(&cnt_next, count_body)]);
        self.builder
            .build_unconditional_branch(count_hdr)
            .map_err(|e| cg(e, "count loop back"))?;

        self.builder.position_at_end(count_done);
        // `cnt_val` is the header phi, so on the exit edge it holds the
        // final digit count.
        let digit_count = cnt_val;
        // total_len = digit_count + sign_len.
        let total_len = self
            .builder
            .build_int_add(digit_count, sign_len, "i2s_totallen")
            .map_err(|e| cg(e, "totallen"))?;
        // record_size = (total_len + 4) rounded up to a 4-byte multiple
        // so the scratch cursor stays 4-aligned for the next record
        // (the return path aligns a String payload up to 4 bytes). See
        // the cranelift `emit_int_to_str` for the full rationale. The
        // header still stores the exact `total_len`.
        let raw_size = self
            .builder
            .build_int_add(total_len, four, "i2s_rawsize")
            .map_err(|e| cg(e, "rawsize"))?;
        let three = i32_t.const_int(3, false);
        // Bit pattern of -4: the mask that clears the low two bits, so
        // `(raw_size + 3) & -4` rounds up to a multiple of four.
        let neg_four = i32_t.const_int((-4i64) as u64, false);
        let bumped = self
            .builder
            .build_int_add(raw_size, three, "i2s_bumped")
            .map_err(|e| cg(e, "bumped"))?;
        let record_size = self
            .builder
            .build_and(bumped, neg_four, "i2s_recsize")
            .map_err(|e| cg(e, "recsize"))?;

        // Allocate the record; pop its arena offset.
        self.emit_alloc_scratch_common(record_size)?;
        let base_off = self.pop_int(ip_hint)?;
        // Header: store total_len at base.
        let base_abs = self.arena_addr_i32_checked_const(base_off, 4, "IntToStr header")?;
        self.builder
            .build_store(base_abs, total_len)
            .map_err(|e| cg(e, "header store"))?;

        // payload_off = base_off + 4; digits_off = payload_off + sign_len;
        // end_off = digits_off + digit_count (one past last digit).
        let payload_off = self
            .builder
            .build_int_add(base_off, four, "i2s_payoff")
            .map_err(|e| cg(e, "payoff"))?;
        let digits_off = self
            .builder
            .build_int_add(payload_off, sign_len, "i2s_digoff")
            .map_err(|e| cg(e, "digoff"))?;
        let end_off = self
            .builder
            .build_int_add(digits_off, digit_count, "i2s_endoff")
            .map_err(|e| cg(e, "endoff"))?;

        // ---- Pass 2: fill digits back-to-front ----
        // m = mag; cursor = end_off;
        // do { d = m % 10; cursor -= 1; store '0'+d at cursor; m /= 10 }
        // while m != 0
        // The body runs before the exit test, so `0` still emits its
        // single digit.
        let write_hdr = self.ctx.append_basic_block(self.func, "i2s_write_hdr");
        let write_done = self.ctx.append_basic_block(self.func, "i2s_write_done");
        // Captured after the allocation / header store above, so this is
        // whichever block the builder ended up in, not necessarily
        // `count_done`.
        let write_pre = self
            .builder
            .get_insert_block()
            .ok_or_else(|| LlvmError::Codegen("IntToStr: no write-pre block".into()))?;
        self.builder
            .build_unconditional_branch(write_hdr)
            .map_err(|e| cg(e, "to write_hdr"))?;

        self.builder.position_at_end(write_hdr);
        let m_phi = self
            .builder
            .build_phi(i64_t, "i2s_m")
            .map_err(|e| cg(e, "m phi"))?;
        let cur_phi = self
            .builder
            .build_phi(i32_t, "i2s_cur")
            .map_err(|e| cg(e, "cur phi"))?;
        m_phi.add_incoming(&[(&mag, write_pre)]);
        cur_phi.add_incoming(&[(&end_off, write_pre)]);
        let m_val = m_phi.as_basic_value().into_int_value();
        let cur_val = cur_phi.as_basic_value().into_int_value();
        // Lowest remaining decimal digit, as an ASCII byte.
        let rem = self
            .builder
            .build_int_unsigned_rem(m_val, ten64, "i2s_rem")
            .map_err(|e| cg(e, "rem"))?;
        let rem32 = self
            .builder
            .build_int_truncate(rem, i32_t, "i2s_rem32")
            .map_err(|e| cg(e, "rem32"))?;
        let ascii0 = i32_t.const_int(u64::from(b'0'), false);
        let ch = self
            .builder
            .build_int_add(rem32, ascii0, "i2s_ch")
            .map_err(|e| cg(e, "ch"))?;
        let ch8 = self
            .builder
            .build_int_truncate(ch, i8_t, "i2s_ch8")
            .map_err(|e| cg(e, "ch8"))?;
        // Step the cursor back one byte, then store the digit there.
        let cur_next = self
            .builder
            .build_int_sub(cur_val, one32, "i2s_curnext")
            .map_err(|e| cg(e, "curnext"))?;
        let ch_abs = self.arena_addr_i32_checked_const(cur_next, 1, "IntToStr digit")?;
        self.builder
            .build_store(ch_abs, ch8)
            .map_err(|e| cg(e, "digit store"))?;
        let m_next = self
            .builder
            .build_int_unsigned_div(m_val, ten64, "i2s_mnext")
            .map_err(|e| cg(e, "mnext"))?;
        let more = self
            .builder
            .build_int_compare(IntPredicate::NE, m_next, zero64, "i2s_more")
            .map_err(|e| cg(e, "more"))?;
        // Re-read the insert block for the back-edge rather than
        // assuming it is still `write_hdr`.
        // NOTE(review): presumably the bounds-checked address helper
        // above can append blocks and move the builder — confirm.
        let write_body_end = self
            .builder
            .get_insert_block()
            .ok_or_else(|| LlvmError::Codegen("IntToStr: no write body block".into()))?;
        m_phi.add_incoming(&[(&m_next, write_body_end)]);
        cur_phi.add_incoming(&[(&cur_next, write_body_end)]);
        self.builder
            .build_conditional_branch(more, write_hdr, write_done)
            .map_err(|e| cg(e, "write br"))?;

        self.builder.position_at_end(write_done);
        // Prepend '-' at payload_off when negative.
        let minus_body = self.ctx.append_basic_block(self.func, "i2s_minus_body");
        let minus_done = self.ctx.append_basic_block(self.func, "i2s_minus_done");
        self.builder
            .build_conditional_branch(is_neg, minus_body, minus_done)
            .map_err(|e| cg(e, "minus br"))?;
        self.builder.position_at_end(minus_body);
        let minus_abs = self.arena_addr_i32_checked_const(payload_off, 1, "IntToStr minus")?;
        let minus8 = i8_t.const_int(u64::from(b'-'), false);
        self.builder
            .build_store(minus_abs, minus8)
            .map_err(|e| cg(e, "minus store"))?;
        self.builder
            .build_unconditional_branch(minus_done)
            .map_err(|e| cg(e, "minus to done"))?;
        // Subsequent lowering continues from the join block.
        self.builder.position_at_end(minus_done);

        // Result: the record's arena offset, typed as a String.
        self.push(base_off, IrType::String);
        Ok(())
    }

    /// Return the module's declaration of the
    /// [`crate::str_helpers::relon_llvm_f64_to_str`] extern, creating
    /// it on first use.
    ///
    /// Declared signature: `i32 fn(i64 bits, ptr dest)` with external
    /// linkage. Idempotent — an existing function carrying the shim's
    /// symbol name is returned as-is. This method only declares the
    /// symbol; binding it to the host implementation (a global mapping
    /// on the native leg, an `env` import on the wasm32 leg) happens
    /// outside this function.
    pub(crate) fn declare_f64_to_str_extern(&self) -> FunctionValue<'ctx> {
        let symbol = crate::str_helpers::RELON_LLVM_F64_TO_STR_SYMBOL;
        self.module.get_function(symbol).unwrap_or_else(|| {
            let status_ty = self.ctx.i32_type();
            let bits_ty = self.ctx.i64_type();
            let dest_ty = self.ctx.ptr_type(AddressSpace::default());
            let signature = status_ty.fn_type(&[bits_ty.into(), dest_ty.into()], false);
            self.module
                .add_function(symbol, signature, Some(Linkage::External))
        })
    }

    /// Lower `Op::FloatToStr`: pop one `F64` (carried on the virtual
    /// stack as its raw IEEE-754 bits in an i64), have the
    /// [`crate::str_helpers::relon_llvm_f64_to_str`] host shim write
    /// the `String` record into a fresh scratch-arena reservation, and
    /// push the record's i32 offset as [`IrType::String`].
    ///
    /// In contrast to `IntToStr`, which is open-coded, the formatting
    /// here is delegated to a host call so that every backend shares
    /// one byte producer.
    ///
    /// The reservation is always `FLOAT_TO_STR_RECORD_SIZE` bytes and
    /// its address is bounds-checked before being handed to the shim.
    /// If the shim reports failure with a negative return value, the
    /// generated code traps instead of pushing a possibly corrupt
    /// record.
    ///
    /// # Errors
    ///
    /// Returns [`LlvmError::Codegen`] when a builder call fails or the
    /// shim call does not yield an integer value; also propagates
    /// errors from the operand pops, the scratch allocation, the arena
    /// address helper and the trap emission.
    pub(crate) fn emit_float_to_str(&mut self, ip_hint: &str) -> Result<(), LlvmError> {
        use relon_ir::float_str::FLOAT_TO_STR_RECORD_SIZE;
        // Shared error-context wrapper for the builder calls below.
        let wrap = |what: &str, e: inkwell::builder::BuilderError| {
            LlvmError::Codegen(format!("FloatToStr {what}: {e}"))
        };

        // The operand is already an i64 holding the float's bits, which
        // is exactly what the shim's first parameter takes.
        let float_bits = self.pop_int(ip_hint)?;

        // Reserve the fixed-size destination record and resolve its
        // bounds-checked address.
        self.emit_alloc_scratch_static(FLOAT_TO_STR_RECORD_SIZE)?;
        let record_off = self.pop_int(ip_hint)?;
        let record_ptr = self.arena_addr_i32_checked_const(
            record_off,
            FLOAT_TO_STR_RECORD_SIZE,
            "FloatToStr",
        )?;

        let shim_fn = self.declare_f64_to_str_extern();
        let shim_args = [
            BasicMetadataValueEnum::IntValue(float_bits),
            BasicMetadataValueEnum::PointerValue(record_ptr),
        ];
        let call_site = self
            .builder
            .build_call(shim_fn, &shim_args, &self.next_name("f64_to_str"))
            .map_err(|e| wrap("call", e))?;
        let status = match call_site.try_as_basic_value() {
            inkwell::values::ValueKind::Basic(BasicValueEnum::IntValue(v)) => v,
            inkwell::values::ValueKind::Basic(other) => {
                return Err(LlvmError::Codegen(format!(
                    "relon_llvm_f64_to_str returned non-int {other:?}"
                )));
            }
            inkwell::values::ValueKind::Instruction(_) => {
                return Err(LlvmError::Codegen(
                    "relon_llvm_f64_to_str returned void; expected i32".into(),
                ));
            }
        };

        // status < 0 → loud trap (never a silent corrupt record).
        let zero = self.ctx.i32_type().const_int(0, false);
        let shim_failed = self
            .builder
            .build_int_compare(IntPredicate::SLT, status, zero, "f2s_failed")
            .map_err(|e| wrap("fail cmp", e))?;
        let trap_block = self.ctx.append_basic_block(self.func, "f2s_trap");
        let ok_block = self.ctx.append_basic_block(self.func, "f2s_ok");
        self.builder
            .build_conditional_branch(shim_failed, trap_block, ok_block)
            .map_err(|e| wrap("fail branch", e))?;

        // Failure path: trap, then terminate the block.
        self.builder.position_at_end(trap_block);
        self.emit_llvm_trap_call("FloatToStr")?;
        self.builder
            .build_unreachable()
            .map_err(|e| wrap("trap unreachable", e))?;

        // Success path: lowering continues here with the record offset
        // on the stack.
        self.builder.position_at_end(ok_block);
        self.push(record_off, IrType::String);
        Ok(())
    }
}