cranelift_codegen/isa/x64/inst/emit.rs

1use crate::ir::KnownSymbol;
2use crate::ir::immediates::{Ieee32, Ieee64};
3use crate::isa::x64::external::{AsmInst, CraneliftRegisters, PairedGpr};
4use crate::isa::x64::inst::args::*;
5use crate::isa::x64::inst::*;
6use crate::isa::x64::lower::isle::generated_code::{Atomic128RmwSeqOp, AtomicRmwSeqOp};
7use cranelift_assembler_x64 as asm;
8
9/// A small helper to generate a signed conversion instruction.
10fn emit_signed_cvt(
11    sink: &mut MachBuffer<Inst>,
12    info: &EmitInfo,
13    state: &mut EmitState,
14    src: Reg,
15    dst: Writable<Reg>,
16    to_f64: bool,
17) {
18    assert!(src.is_real());
19    assert!(dst.to_reg().is_real());
20
21    // Handle an unsigned int, which is the "easy" case: a signed conversion
22    // will do the right thing.
23    let dst = WritableXmm::from_writable_reg(dst).unwrap();
24    if to_f64 {
25        asm::inst::cvtsi2sdq_a::new(dst, src).emit(sink, info, state);
26    } else {
27        asm::inst::cvtsi2ssq_a::new(dst, src).emit(sink, info, state);
28    }
29}
30
31/// Emits a one-way conditional jump if CC is set (true).
32fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
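    // A `jcc` with a 32-bit displacement is encoded as two opcode bytes (0F 8x)
    // followed by the rel32, so the displacement to patch starts 2 bytes into
    // the instruction and the whole instruction is 6 bytes long.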
33    let cond_start = sink.cur_offset();
34    let cond_disp_off = cond_start + 2;
35    sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
36    emit_jcc_no_offset(sink, cc);
37    debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
38}
39
40/// Like `one_way_jmp` above, this emits a conditional jump, but it also
41/// registers the branch with `MachBuffer::add_cond_branch`.
42fn cond_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
43    let cond_start = sink.cur_offset();
44    let cond_disp_off = cond_start + 2;
45    let cond_end = cond_start + 6;
46
47    sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
48    // FIXME: ideally this `inverted` calculation would go through the external
49    // assembler, but for now it's left done manually.
50    let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
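    // The inverted form is recorded so that `MachBuffer` can flip the branch's
    // sense if the taken target ends up being the fall-through block during
    // branch simplification.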
51    sink.add_cond_branch(cond_start, cond_end, label, &inverted[..]);
52
53    emit_jcc_no_offset(sink, cc);
54
55    debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
56    debug_assert_eq!(sink.cur_offset(), cond_end);
57}
58
59fn emit_jcc_no_offset(sink: &mut MachBuffer<Inst>, cc: CC) {
60    // Note that the mnemonic chosen here matches Capstone's disassembly, which doesn't always
61    // match the `CC` variant name since Intel defines multiple mnemonics for the same encoding.
62    let inst: AsmInst = match cc {
63        CC::Z => asm::inst::je_d32::new(0).into(),   // jz == je
64        CC::NZ => asm::inst::jne_d32::new(0).into(), // jnz == jne
65        CC::B => asm::inst::jb_d32::new(0).into(),
66        CC::NB => asm::inst::jae_d32::new(0).into(), // jnb == jae
67        CC::BE => asm::inst::jbe_d32::new(0).into(),
68        CC::NBE => asm::inst::ja_d32::new(0).into(), // jnbe == ja
69        CC::L => asm::inst::jl_d32::new(0).into(),
70        CC::LE => asm::inst::jle_d32::new(0).into(),
71        CC::NL => asm::inst::jge_d32::new(0).into(), // jnl == jge
72        CC::NLE => asm::inst::jg_d32::new(0).into(), // jnle == jg
73        CC::O => asm::inst::jo_d32::new(0).into(),
74        CC::NO => asm::inst::jno_d32::new(0).into(),
75        CC::P => asm::inst::jp_d32::new(0).into(),
76        CC::NP => asm::inst::jnp_d32::new(0).into(),
77        CC::S => asm::inst::js_d32::new(0).into(),
78        CC::NS => asm::inst::jns_d32::new(0).into(),
79    };
80    inst.encode(&mut external::AsmCodeSink {
81        sink,
82        incoming_arg_offset: 0,
83        slot_offset: 0,
84    });
85}
86
87/// Emits an unconditional branch.
88fn uncond_jmp(sink: &mut MachBuffer<Inst>, label: MachLabel) {
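    // A `jmp rel32` is encoded as a single opcode byte (E9) followed by the
    // rel32, so the displacement to patch starts 1 byte into the instruction
    // and the whole instruction is 5 bytes long.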
89    let uncond_start = sink.cur_offset();
90    let uncond_disp_off = uncond_start + 1;
91    let uncond_end = uncond_start + 5;
92
93    sink.use_label_at_offset(uncond_disp_off, label, LabelUse::JmpRel32);
94    sink.add_uncond_branch(uncond_start, uncond_end, label);
95
96    asm::inst::jmp_d32::new(0).encode(&mut external::AsmCodeSink {
97        sink,
98        incoming_arg_offset: 0,
99        slot_offset: 0,
100    });
101    debug_assert_eq!(sink.cur_offset(), uncond_disp_off + 4);
102    debug_assert_eq!(sink.cur_offset(), uncond_end);
103}
104
105/// Emits a relocation, attaching the current source location as well.
106fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {
107    sink.add_reloc(kind, name, addend);
108}
109
110/// The top-level emit function.
111///
112/// Important!  Do not add improved (shortened) encoding cases to existing
113/// instructions without also adding tests for those improved encodings.  That
114/// is a dangerous game that leads to hard-to-track-down errors in the emitted
115/// code.
116///
117/// For all instructions, make sure to have test coverage for all of the
118/// following situations.  Do this by creating the cross product resulting from
119/// applying the following rules to each operand:
120///
121/// (1) for any insn that mentions a register: one test using a register from
122///     the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
123///     using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
124///     This helps detect incorrect REX prefix construction.
125///
126/// (2) for any insn that mentions a byte register: one test for each of the
127///     four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
128///     [r8b .. r11b] and [r12b .. r15b].  This checks that
129///     apparently-redundant REX prefixes are retained when required.
130///
131/// (3) for any insn that contains an immediate field, check the following
132///     cases: field is zero, field is in simm8 range (-128 .. 127), field is
133///     in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF).  This is because some
134///     instructions that require a 32-bit immediate have a short-form encoding
135///     when the imm is in simm8 range.
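///     For example, an `addq` with an immediate might be exercised with `$0`,
///     `$0x7f`, and `$0x1234_5678`.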
136///
137/// Rules (1), (2) and (3) don't apply for registers within address expressions
138/// (`Addr`s).  Those are already pretty well tested, and the registers in them
139/// don't have any effect on the containing instruction (apart from possibly
140/// requiring REX prefix bits).
141///
142/// When choosing registers for a test, avoid using registers with the same
143/// offset within a given group.  For example, don't use rax and r8, since they
144/// both have the lowest 3 bits as 000, and so the test won't detect errors
145/// where those 3-bit register sub-fields are confused by the emitter.  Instead
146/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001).  Similarly, don't use (eg) cl
147/// and bpl since they have the same offset in their group; use instead (eg) cl
148/// and sil.
149///
150/// For all instructions, also add a test that uses only low-half registers
151/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
152/// prefixes are correctly omitted.  This low-half restriction must apply to
153/// _all_ registers in the insn, even those in address expressions.
154///
155/// Following these rules creates large numbers of test cases, but it's the
156/// only way to make the emitter reliable.
157///
158/// Known possible improvements:
159///
160/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate.  (Do we
161///   care?)
162pub(crate) fn emit(
163    inst: &Inst,
164    sink: &mut MachBuffer<Inst>,
165    info: &EmitInfo,
166    state: &mut EmitState,
167) {
168    if !inst.is_available(&info) {
169        let features = if let Inst::External { inst } = inst {
170            inst.features().to_string()
171        } else {
172            "see `is_available` source for feature term".to_string()
173        };
174        panic!(
175            "Cannot emit inst '{inst:?}' for target; failed to match ISA requirements: {features}"
176        );
177    }
178
179    match inst {
180        Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {
181            // Validate that the register constraints of the dividend and the
182            // destination are all as expected.
183            let (dst, size) = match inst {
184                Inst::CheckedSRemSeq {
185                    dividend_lo,
186                    dividend_hi,
187                    dst_quotient,
188                    dst_remainder,
189                    size,
190                    ..
191                } => {
192                    let dividend_lo = dividend_lo.to_reg();
193                    let dividend_hi = dividend_hi.to_reg();
194                    let dst_quotient = dst_quotient.to_reg().to_reg();
195                    let dst_remainder = dst_remainder.to_reg().to_reg();
196                    debug_assert_eq!(dividend_lo, regs::rax());
197                    debug_assert_eq!(dividend_hi, regs::rdx());
198                    debug_assert_eq!(dst_quotient, regs::rax());
199                    debug_assert_eq!(dst_remainder, regs::rdx());
200                    (regs::rdx(), *size)
201                }
202                Inst::CheckedSRemSeq8 { dividend, dst, .. } => {
203                    let dividend = dividend.to_reg();
204                    let dst = dst.to_reg().to_reg();
205                    debug_assert_eq!(dividend, regs::rax());
206                    debug_assert_eq!(dst, regs::rax());
207                    (regs::rax(), OperandSize::Size8)
208                }
209                _ => unreachable!(),
210            };
211
212            // Generates the following code sequence:
213            //
214            // cmp -1 %divisor
215            // jnz $do_op
216            //
217            // ;; for srem, result is 0
218            // mov #0, %dst
219            // j $done
220            //
221            // $do_op:
222            // idiv %divisor
223            //
224            // $done:
225
226            let do_op = sink.get_label();
227            let done_label = sink.get_label();
228
229            // Check if the divisor is -1, and if it isn't then immediately
230            // go to the `idiv`.
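            // A divisor of -1 is special-cased because `idiv` raises #DE on
            // `INT_MIN / -1`, while the mathematically correct remainder for a
            // divisor of -1 is 0 regardless of the dividend.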
231            let inst = Inst::cmp_mi_sxb(size, *divisor, -1);
232            inst.emit(sink, info, state);
233            one_way_jmp(sink, CC::NZ, do_op);
234
235            // ... otherwise the divisor is -1 and the result is always 0. This
236            // is written to the destination register which will be %rax for
237            // 8-bit srem and %rdx otherwise.
238            //
239            // Note that for 16-to-64-bit srem operations this leaves the
240            // second destination, %rax, unchanged. This isn't semantically
241            // correct if a lowering actually tries to use the `dst_quotient`
242            // output but for srem only the `dst_remainder` output is used for
243            // now.
244            let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));
245            inst.emit(sink, info, state);
246            let inst = Inst::jmp_known(done_label);
247            inst.emit(sink, info, state);
248
249            // Here the `idiv` is executed, which is different depending on the
250            // size
251            sink.bind_label(do_op, state.ctrl_plane_mut());
252            let rax = Gpr::RAX;
253            let rdx = Gpr::RDX;
254            let writable_rax = Writable::from_reg(rax);
255            let writable_rdx = Writable::from_reg(rdx);
256            let inst: AsmInst = match size {
257                OperandSize::Size8 => asm::inst::idivb_m::new(
258                    PairedGpr::from(writable_rax),
259                    *divisor,
260                    TrapCode::INTEGER_DIVISION_BY_ZERO,
261                )
262                .into(),
263
264                OperandSize::Size16 => asm::inst::idivw_m::new(
265                    PairedGpr::from(writable_rax),
266                    PairedGpr::from(writable_rdx),
267                    *divisor,
268                    TrapCode::INTEGER_DIVISION_BY_ZERO,
269                )
270                .into(),
271
272                OperandSize::Size32 => asm::inst::idivl_m::new(
273                    PairedGpr::from(writable_rax),
274                    PairedGpr::from(writable_rdx),
275                    *divisor,
276                    TrapCode::INTEGER_DIVISION_BY_ZERO,
277                )
278                .into(),
279
280                OperandSize::Size64 => asm::inst::idivq_m::new(
281                    PairedGpr::from(writable_rax),
282                    PairedGpr::from(writable_rdx),
283                    *divisor,
284                    TrapCode::INTEGER_DIVISION_BY_ZERO,
285                )
286                .into(),
287            };
288            inst.emit(sink, info, state);
289
290            sink.bind_label(done_label, state.ctrl_plane_mut());
291        }
292
293        Inst::MovFromPReg { src, dst } => {
294            let src: Reg = (*src).into();
295            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));
296            asm::inst::movq_mr::new(*dst, Gpr::unwrap_new(src)).emit(sink, info, state);
297        }
298
299        Inst::MovToPReg { src, dst } => {
300            let dst: Reg = (*dst).into();
301            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));
302            let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
303            asm::inst::movq_mr::new(dst, *src).emit(sink, info, state);
304        }
305
306        Inst::XmmCmove {
307            ty,
308            cc,
309            consequent,
310            alternative,
311            dst,
312        } => {
313            let alternative = *alternative;
314            let dst = *dst;
315            debug_assert_eq!(alternative, dst.to_reg());
316            let consequent = *consequent;
317
318            // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
319            // this doesn't clobber flags. Make sure to not do so here.
320            let next = sink.get_label();
321
322            // Jump if cc is *not* set.
323            one_way_jmp(sink, cc.invert(), next);
324            Inst::gen_move(dst.map(|r| r.to_reg()), consequent.to_reg(), *ty)
325                .emit(sink, info, state);
326
327            sink.bind_label(next, state.ctrl_plane_mut());
328        }
329
330        Inst::StackProbeLoop {
331            tmp,
332            frame_size,
333            guard_size,
334        } => {
335            assert!(info.flags.enable_probestack());
336            assert!(guard_size.is_power_of_two());
337
338            let tmp = *tmp;
339
340            // Number of probes that we need to perform
341            let probe_count = align_to(*frame_size, *guard_size) / guard_size;
342
343            // The inline stack probe loop has 3 phases:
344            //
345            // We generate the "guard area" register which is essentially the frame_size aligned to
346            // guard_size. We copy the stack pointer and subtract the guard area from it. This
347            // gets us a register that we can use to compare when looping.
348            //
349            // After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd
350            // distance at a time and then touch the stack by writing anything to it. We use the previously
351            // created "guard area" register to know when to stop looping.
352            //
353            // When we have touched all the pages that we need, we have to restore the stack pointer
354            // to where it was before.
355            //
356            // Generate the following code:
357            //         mov  tmp_reg, rsp
358            //         sub  tmp_reg, guard_size * probe_count
359            // .loop_start:
360            //         sub  rsp, guard_size
361            //         mov  [rsp], rsp
362            //         cmp  rsp, tmp_reg
363            //         jne  .loop_start
364            //         add  rsp, guard_size * probe_count
365
366            // Create the guard bound register
367            // mov  tmp_reg, rsp
368            let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
369            inst.emit(sink, info, state);
370
371            // sub  tmp_reg, GUARD_SIZE * probe_count
372            let guard_plus_count = i32::try_from(guard_size * probe_count)
373                .expect("`guard_size * probe_count` is too large to fit in a 32-bit immediate");
374            Inst::subq_mi(tmp, guard_plus_count).emit(sink, info, state);
375
376            // Emit the main loop!
377            let loop_start = sink.get_label();
378            sink.bind_label(loop_start, state.ctrl_plane_mut());
379
380            // sub  rsp, GUARD_SIZE
381            let rsp = Writable::from_reg(regs::rsp());
382            let guard_size_ = i32::try_from(*guard_size)
383                .expect("`guard_size` is too large to fit in a 32-bit immediate");
384            Inst::subq_mi(rsp, guard_size_).emit(sink, info, state);
385
386            // TODO: `mov [rsp], 0` would be better, but we don't have that instruction
387            // Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable
388            // instruction size.
389            // mov  [rsp], rsp
390            asm::inst::movl_mr::new(Amode::imm_reg(0, regs::rsp()), Gpr::RSP)
391                .emit(sink, info, state);
392
393            // Compare and jump if we are not done yet
394            // cmp  rsp, tmp_reg
395            let tmp = Gpr::unwrap_new(tmp.to_reg());
396            asm::inst::cmpq_rm::new(tmp, Gpr::RSP).emit(sink, info, state);
397
398            // jne  .loop_start
399            // TODO: Encoding the conditional jump as a short jump
400            // could save us 4 bytes here.
401            one_way_jmp(sink, CC::NZ, loop_start);
402
403            // The regular prologue code is going to emit a `sub` after this, so we need to
404            // reset the stack pointer
405            //
406            // TODO: It would be better if we could avoid the `add` + `sub` that is generated here
407            // and in the stack adj portion of the prologue
408            //
409            // add rsp, GUARD_SIZE * probe_count
410            Inst::addq_mi(rsp, guard_plus_count).emit(sink, info, state);
411        }
412
413        Inst::CallKnown { info: call_info } => {
414            let stack_map = state.take_stack_map();
415
416            asm::inst::callq_d::new(0).emit(sink, info, state);
417
418            // The last 4 bytes of `callq` are the relative displacement to where
419            // we're calling, so that's where the reloc is registered.
420            //
421            // The addend adjusts for the difference between the end of the
422            // instruction and the beginning of the immediate field.
423            let len = sink.cur_offset();
424            sink.add_reloc_at_offset(len - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
425
426            if let Some(s) = stack_map {
427                sink.push_user_stack_map(state, len, s);
428            }
429
430            if let Some(try_call) = call_info.try_call_info.as_ref() {
431                sink.add_try_call_site(
432                    Some(state.frame_layout().sp_to_fp()),
433                    try_call.exception_handlers(&state.frame_layout()),
434                );
435            } else {
436                sink.add_call_site();
437            }
438
439            // Reclaim the outgoing argument area that was released by the
440            // callee, to ensure that StackAMode values are always computed from
441            // a consistent SP.
442            if call_info.callee_pop_size > 0 {
443                let rsp = Writable::from_reg(regs::rsp());
444                let callee_pop_size = i32::try_from(call_info.callee_pop_size)
445                    .expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
446                Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
447            }
448
449            // Load any stack-carried return values.
450            call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
451                state.frame_layout().stackslots_size,
452                |inst| inst.emit(sink, info, state),
453                |_space_needed| None,
454            );
455
456            // If this is a try-call, jump to the continuation
457            // (normal-return) block.
458            if let Some(try_call) = call_info.try_call_info.as_ref() {
459                let jmp = Inst::JmpKnown {
460                    dst: try_call.continuation,
461                };
462                jmp.emit(sink, info, state);
463            }
464        }
465
466        Inst::ReturnCallKnown { info: call_info } => {
467            emit_return_call_common_sequence(sink, info, state, &call_info);
468
469            // Finally, jump to the callee!
470            //
471            // Note: this is not `Inst::Jmp { .. }.emit(..)` because we have
472            // different metadata in this case: we don't have a label for the
473            // target, but rather a function relocation.
474            asm::inst::jmp_d32::new(0).emit(sink, info, state);
475            let offset = sink.cur_offset();
476            // The addend adjusts for the difference between the end of the instruction and the
477            // beginning of the immediate field.
478            sink.add_reloc_at_offset(offset - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
479            sink.add_call_site();
480        }
481
482        Inst::ReturnCallUnknown { info: call_info } => {
483            let callee = call_info.dest;
484
485            emit_return_call_common_sequence(sink, info, state, &call_info);
486
487            asm::inst::jmpq_m::new(callee).emit(sink, info, state);
488            sink.add_call_site();
489        }
490
491        Inst::CallUnknown {
492            info: call_info, ..
493        } => {
494            let stack_map = state.take_stack_map();
495
496            let dest = match call_info.dest.clone() {
497                RegMem::Reg { reg } => asm::GprMem::Gpr(Gpr::unwrap_new(reg)),
498                RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
499            };
500
501            asm::inst::callq_m::new(dest).emit(sink, info, state);
502
503            if let Some(s) = stack_map {
504                let offset = sink.cur_offset();
505                sink.push_user_stack_map(state, offset, s);
506            }
507
508            if let Some(try_call) = call_info.try_call_info.as_ref() {
509                sink.add_try_call_site(
510                    Some(state.frame_layout().sp_to_fp()),
511                    try_call.exception_handlers(&state.frame_layout()),
512                );
513            } else {
514                sink.add_call_site();
515            }
516
517            // Reclaim the outgoing argument area that was released by the callee, to ensure that
518            // StackAMode values are always computed from a consistent SP.
519            if call_info.callee_pop_size > 0 {
520                let rsp = Writable::from_reg(regs::rsp());
521                let callee_pop_size = i32::try_from(call_info.callee_pop_size)
522                    .expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
523                Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
524            }
525
526            // Load any stack-carried return values.
527            call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
528                state.frame_layout().stackslots_size,
529                |inst| inst.emit(sink, info, state),
530                |_space_needed| None,
531            );
532
533            if let Some(try_call) = call_info.try_call_info.as_ref() {
534                let jmp = Inst::JmpKnown {
535                    dst: try_call.continuation,
536                };
537                jmp.emit(sink, info, state);
538            }
539        }
540
541        Inst::Args { .. } => {}
542        Inst::Rets { .. } => {}
543
544        Inst::StackSwitchBasic {
545            store_context_ptr,
546            load_context_ptr,
547            in_payload0,
548            out_payload0,
549        } => {
550            // Note that we do not emit anything for preserving and restoring
551            // ordinary registers here: That's taken care of by regalloc for us,
552            // since we marked this instruction as clobbering all registers.
553            //
554            // Also note that we do nothing about passing the single payload
555            // value: We've informed regalloc that it is sent and received via
556            // the fixed register given by [stack_switch::payload_register]
557
558            let (tmp1, tmp2) = {
559                // Ideally we would just ask regalloc for two temporary registers.
560                // However, adding any early defs to the constraints on StackSwitch
561                // causes TooManyLiveRegs. Fortunately, we can manually find tmp
562                // registers without regalloc: Since our instruction clobbers all
563                // registers, we can simply pick any register that is not assigned
564                // to the operands.
565
566                let all = crate::isa::x64::abi::ALL_CLOBBERS;
567
568                let used_regs = [
569                    **load_context_ptr,
570                    **store_context_ptr,
571                    **in_payload0,
572                    *out_payload0.to_reg(),
573                ];
574
575                let mut tmps = all.into_iter().filter_map(|preg| {
576                    let reg: Reg = preg.into();
577                    if !used_regs.contains(&reg) {
578                        WritableGpr::from_writable_reg(isle::WritableReg::from_reg(reg))
579                    } else {
580                        None
581                    }
582                });
583                (tmps.next().unwrap(), tmps.next().unwrap())
584            };
585
586            let layout = stack_switch::control_context_layout();
587            let rsp_offset = layout.stack_pointer_offset as i32;
588            let pc_offset = layout.ip_offset as i32;
589            let rbp_offset = layout.frame_pointer_offset as i32;
590
591            // Location to which someone switching back to this stack will jump:
592            // right after the `StackSwitch` instruction.
593            let resume = sink.get_label();
594
595            //
596            // For RBP and RSP we do the following:
597            // - Load new value for register from `load_context_ptr` +
598            // corresponding offset.
599            // - Store previous (!) value of register at `store_context_ptr` +
600            // corresponding offset.
601            //
602            // Since `load_context_ptr` and `store_context_ptr` are allowed to be
603            // equal, we need to use a temporary register here.
604            //
605
606            let mut exchange = |offset, reg| {
607                let addr = SyntheticAmode::real(Amode::imm_reg(offset, **load_context_ptr));
608                asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);
609
610                asm::inst::movq_mr::new(
611                    Amode::imm_reg(offset, **store_context_ptr),
612                    Gpr::new(reg).unwrap(),
613                )
614                .emit(sink, info, state);
615
616                let dst = Writable::from_reg(reg);
617                asm::inst::movq_mr::new(dst.map(Gpr::unwrap_new), tmp1.to_reg())
618                    .emit(sink, info, state);
619            };
620
621            exchange(rsp_offset, regs::rsp());
622            exchange(rbp_offset, regs::rbp());
623
624            //
625            // Load target PC, store resume PC, jump to target PC
626            //
627
628            let addr = SyntheticAmode::real(Amode::imm_reg(pc_offset, **load_context_ptr));
629            asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);
630
631            let amode = Amode::RipRelative { target: resume };
632            asm::inst::leaq_rm::new(tmp2, amode).emit(sink, info, state);
633
634            asm::inst::movq_mr::new(
635                Amode::imm_reg(pc_offset, **store_context_ptr),
636                tmp2.to_reg(),
637            )
638            .emit(sink, info, state);
639
640            asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);
641
642            sink.bind_label(resume, state.ctrl_plane_mut());
643        }
644
645        Inst::JmpKnown { dst } => uncond_jmp(sink, *dst),
646
647        Inst::WinchJmpIf { cc, taken } => one_way_jmp(sink, *cc, *taken),
648
649        Inst::JmpCond {
650            cc,
651            taken,
652            not_taken,
653        } => {
654            cond_jmp(sink, *cc, *taken);
655            uncond_jmp(sink, *not_taken);
656        }
657
658        Inst::JmpCondOr {
659            cc1,
660            cc2,
661            taken,
662            not_taken,
663        } => {
664            // Emit:
665            //   jcc1 taken
666            //   jcc2 taken
667            //   jmp not_taken
668            //
669            // Note that we enroll both conditionals in the
670            // branch-chomping mechanism because MachBuffer
671            // simplification can continue upward as long as it keeps
672            // chomping branches. In the best case, if taken ==
673            // not_taken and that one block is the fallthrough block,
674            // all three branches can disappear.
675
676            cond_jmp(sink, *cc1, *taken);
677            cond_jmp(sink, *cc2, *taken);
678            uncond_jmp(sink, *not_taken);
679        }
680
681        &Inst::JmpTableSeq {
682            idx,
683            tmp1,
684            tmp2,
685            ref targets,
686            ref default_target,
687            ..
688        } => {
689            // This sequence is *one* instruction in the vcode, and is expanded only here at
690            // emission time, because we cannot allow the regalloc to insert spills/reloads in
691            // the middle; we depend on hardcoded PC-rel addressing below.
692            //
693            // We don't have to worry about emitting islands, because the only label-use type has a
694            // maximum range of 2 GB. If we later consider using shorter-range label references,
695            // this will need to be revisited.
696
697            // We generate the following sequence. Note that the only read of %idx is before the
698            // write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
699            // if you change this.
700            // lea start_of_jump_table_offset(%rip), %tmp1
701            // movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
702            // addq %tmp2, %tmp1
703            // j *%tmp1
704            // $start_of_jump_table:
705            // -- jump table entries
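            //
            // Each table entry is a 32-bit offset relative to the start of the table
            // itself, which keeps the table compact and position-independent.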
706
707            // Load base address of jump table.
708            let start_of_jumptable = sink.get_label();
709            asm::inst::leaq_rm::new(tmp1, Amode::rip_relative(start_of_jumptable))
710                .emit(sink, info, state);
711
712            // Load value out of the jump table. It's a relative offset to the target block, so it
713            // might be negative; use a sign-extension.
714            let inst = Inst::movsx_rm_r(
715                ExtMode::LQ,
716                RegMem::mem(Amode::imm_reg_reg_shift(
717                    0,
718                    Gpr::unwrap_new(tmp1.to_reg()),
719                    Gpr::unwrap_new(idx),
720                    2,
721                )),
722                tmp2,
723            );
724            inst.emit(sink, info, state);
725
726            // Add base of jump table to jump-table-sourced block offset.
727            asm::inst::addq_rm::new(tmp1, tmp2).emit(sink, info, state);
728
729            // Branch to computed address.
730            asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);
731
732            // Emit jump table (table of 32-bit offsets).
733            sink.bind_label(start_of_jumptable, state.ctrl_plane_mut());
734            let jt_off = sink.cur_offset();
735            for &target in targets.iter().chain(std::iter::once(default_target)) {
736                let word_off = sink.cur_offset();
737                // off_into_table is an addend here embedded in the label to be later patched at
738                // the end of codegen. The offset is initially relative to this jump table entry;
739                // with the extra addend, it'll be relative to the jump table's start, after
740                // patching.
741                let off_into_table = word_off - jt_off;
742                sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
743                sink.put4(off_into_table);
744            }
745        }
746
747        Inst::TrapIf { cc, trap_code } => {
748            let trap_label = sink.defer_trap(*trap_code);
749            one_way_jmp(sink, *cc, trap_label);
750        }
751
752        Inst::TrapIfAnd {
753            cc1,
754            cc2,
755            trap_code,
756        } => {
757            let trap_label = sink.defer_trap(*trap_code);
758            let else_label = sink.get_label();
759
760            // Jump to the end if the first condition isn't true, and then if
761            // the second condition is true go to the trap.
762            one_way_jmp(sink, cc1.invert(), else_label);
763            one_way_jmp(sink, *cc2, trap_label);
764
765            sink.bind_label(else_label, state.ctrl_plane_mut());
766        }
767
768        Inst::TrapIfOr {
769            cc1,
770            cc2,
771            trap_code,
772        } => {
773            let trap_label = sink.defer_trap(*trap_code);
774
775            // Emit two jumps to the same trap if either condition code is true.
776            one_way_jmp(sink, *cc1, trap_label);
777            one_way_jmp(sink, *cc2, trap_label);
778        }
779
780        Inst::XmmMinMaxSeq {
781            size,
782            is_min,
783            lhs,
784            rhs,
785            dst,
786        } => {
787            let rhs = rhs.to_reg();
788            let lhs = lhs.to_reg();
789            let dst = dst.to_writable_reg();
790            debug_assert_eq!(rhs, dst.to_reg());
791
792            // Generates the following sequence:
793            // cmpss/cmpsd %lhs, %rhs_dst
794            // jnz do_min_max
795            // jp propagate_nan
796            //
797            // ;; ordered and equal: propagate the sign bit (for -0 vs 0):
798            // {and,or}{ss,sd} %lhs, %rhs_dst
799            // j done
800            //
801            // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the
802            // ;; NaN value is returned), we add both inputs.
803            // propagate_nan:
804            // add{ss,sd} %lhs, %rhs_dst
805            // j done
806            //
807            // do_min_max:
808            // {min,max}{ss,sd} %lhs, %rhs_dst
809            //
810            // done:
811            let done = sink.get_label();
812            let propagate_nan = sink.get_label();
813            let do_min_max = sink.get_label();
814
815            let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
816                OperandSize::Size32 => (
817                    asm::inst::addss_a::new(dst, lhs).into(),
818                    asm::inst::ucomiss_a::new(dst.to_reg(), lhs).into(),
819                    asm::inst::andps_a::new(dst, lhs).into(),
820                    asm::inst::orps_a::new(dst, lhs).into(),
821                    if *is_min {
822                        asm::inst::minss_a::new(dst, lhs).into()
823                    } else {
824                        asm::inst::maxss_a::new(dst, lhs).into()
825                    },
826                ),
827                OperandSize::Size64 => (
828                    asm::inst::addsd_a::new(dst, lhs).into(),
829                    asm::inst::ucomisd_a::new(dst.to_reg(), lhs).into(),
830                    asm::inst::andpd_a::new(dst, lhs).into(),
831                    asm::inst::orpd_a::new(dst, lhs).into(),
832                    if *is_min {
833                        asm::inst::minsd_a::new(dst, lhs).into()
834                    } else {
835                        asm::inst::maxsd_a::new(dst, lhs).into()
836                    },
837                ),
838                _ => unreachable!(),
839            };
840            let add_op: AsmInst = add_op;
841            let or_op: AsmInst = or_op;
842            let min_max_op: AsmInst = min_max_op;
843            let cmp_op: AsmInst = cmp_op;
844
845            cmp_op.emit(sink, info, state);
846
847            one_way_jmp(sink, CC::NZ, do_min_max);
848            one_way_jmp(sink, CC::P, propagate_nan);
849
850            // Ordered and equal. The operands are bit-identical unless they are zero
851            // and negative zero. These instructions merge the sign bits in that
852            // case, and are no-ops otherwise.
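            // For min, OR-ing the sign bits makes -0.0 win over +0.0; for max,
            // AND-ing them makes +0.0 win.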
853            let inst: AsmInst = if *is_min { or_op } else { and_op };
854            inst.emit(sink, info, state);
855
856            let inst = Inst::jmp_known(done);
857            inst.emit(sink, info, state);
858
859            // x86's min/max are not symmetric; if either operand is a NaN, they return the
860            // read-only operand: perform an addition between the two operands, which has the
861            // desired NaN propagation effects.
862            sink.bind_label(propagate_nan, state.ctrl_plane_mut());
863            add_op.emit(sink, info, state);
864
865            one_way_jmp(sink, CC::P, done);
866
867            sink.bind_label(do_min_max, state.ctrl_plane_mut());
868            min_max_op.emit(sink, info, state);
869
870            sink.bind_label(done, state.ctrl_plane_mut());
871        }
872
873        Inst::XmmUninitializedValue { .. } | Inst::GprUninitializedValue { .. } => {
874            // These instruction formats only exist to declare a register as a
875            // `def`; no code is emitted. This is always immediately followed by
876            // an instruction, such as `xor <tmp>, <tmp>`, that semantically
877            // reads this undefined value but arithmetically produces the same
878            // result regardless of its value.
879        }
880
881        Inst::CvtUint64ToFloatSeq {
882            dst_size,
883            src,
884            dst,
885            tmp_gpr1,
886            tmp_gpr2,
887        } => {
888            let src = src.to_reg();
889            let dst = dst.to_writable_reg();
890            let tmp_gpr1 = tmp_gpr1.to_writable_reg();
891            let tmp_gpr2 = tmp_gpr2.to_writable_reg();
892
893            // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
894            // different sequence.
895            //
896            // Emit the following sequence:
897            //
898            //  cmp 0, %src
899            //  jl handle_negative
900            //
901            //  ;; handle positive, which can't overflow
902            //  cvtsi2sd/cvtsi2ss %src, %dst
903            //  j done
904            //
905            //  ;; handle negative: see below for an explanation of what it's doing.
906            //  handle_negative:
907            //  mov %src, %tmp_gpr1
908            //  shr $1, %tmp_gpr1
909            //  mov %src, %tmp_gpr2
910            //  and $1, %tmp_gpr2
911            //  or %tmp_gpr1, %tmp_gpr2
912            //  cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
913            //  addsd/addss %dst, %dst
914            //
915            //  done:
916
917            assert_ne!(src, tmp_gpr1.to_reg());
918            assert_ne!(src, tmp_gpr2.to_reg());
919
920            let handle_negative = sink.get_label();
921            let done = sink.get_label();
922
923            // If x seen as a signed int64 is not negative, a signed-conversion will do the right
924            // thing.
925            // TODO: use `test src, src` here.
926            asm::inst::cmpq_mi_sxb::new(src, 0).emit(sink, info, state);
927
928            one_way_jmp(sink, CC::L, handle_negative);
929
930            // Handle a positive int64, which is the "easy" case: a signed conversion will do the
931            // right thing.
932            emit_signed_cvt(
933                sink,
934                info,
935                state,
936                src,
937                dst,
938                *dst_size == OperandSize::Size64,
939            );
940
941            let inst = Inst::jmp_known(done);
942            inst.emit(sink, info, state);
943
944            sink.bind_label(handle_negative, state.ctrl_plane_mut());
945
946            // Divide x by two to get it in range for the signed conversion, keep the LSB, and
947            // scale it back up on the FP side.
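            // Computing `(src >> 1) | (src & 1)` rounds the halved value to odd,
            // keeping the lost bit "sticky"; doubling the converted result then
            // yields the correctly-rounded float value of the original `src`.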
948            let inst = Inst::gen_move(tmp_gpr1, src, types::I64);
949            inst.emit(sink, info, state);
950
951            // tmp_gpr1 := src >> 1
952            asm::inst::shrq_mi::new(tmp_gpr1, 1).emit(sink, info, state);
953
954            let inst = Inst::gen_move(tmp_gpr2, src, types::I64);
955            inst.emit(sink, info, state);
956
957            asm::inst::andq_mi_sxb::new(tmp_gpr2, 1).emit(sink, info, state);
958
959            asm::inst::orq_rm::new(tmp_gpr2, tmp_gpr1).emit(sink, info, state);
960
961            emit_signed_cvt(
962                sink,
963                info,
964                state,
965                tmp_gpr2.to_reg(),
966                dst,
967                *dst_size == OperandSize::Size64,
968            );
969
970            let inst: AsmInst = match *dst_size {
971                OperandSize::Size64 => asm::inst::addsd_a::new(dst, dst.to_reg()).into(),
972                OperandSize::Size32 => asm::inst::addss_a::new(dst, dst.to_reg()).into(),
973                _ => unreachable!(),
974            };
975            inst.emit(sink, info, state);
976
977            sink.bind_label(done, state.ctrl_plane_mut());
978        }
979
980        Inst::CvtFloatToSintSeq {
981            src_size,
982            dst_size,
983            is_saturating,
984            src,
985            dst,
986            tmp_gpr,
987            tmp_xmm,
988        } => {
989            use OperandSize::*;
990
991            let src = src.to_reg();
992            let dst = dst.to_writable_reg();
993            let tmp_gpr = tmp_gpr.to_writable_reg();
994            let tmp_xmm = tmp_xmm.to_writable_reg();
995
996            // Emits the following common sequence:
997            //
998            // cvttss2si/cvttsd2si %src, %dst
999            // cmp %dst, 1
1000            // jno done
1001            //
1002            // Then, for saturating conversions:
1003            //
1004            // ;; check for NaN
1005            // cmpss/cmpsd %src, %src
1006            // jnp not_nan
1007            // xor %dst, %dst
1008            //
1009            // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
1010            // ;; already in %dst.
1011            // xorpd %tmp_xmm, %tmp_xmm
1012            // cmpss/cmpsd %src, %tmp_xmm
1013            // jnb done
1014            // mov/movaps $INT_MAX, %dst
1015            //
1016            // done:
1017            //
1018            // Then, for non-saturating conversions:
1019            //
1020            // ;; check for NaN
1021            // cmpss/cmpsd %src, %src
1022            // jnp not_nan
1023            // ud2 trap BadConversionToInteger
1024            //
1025            // ;; check if INT_MIN was the correct result, against a magic constant:
1026            // not_nan:
1027            // movaps/mov $magic, %tmp_gpr
1028            // movq/movd %tmp_gpr, %tmp_xmm
1029            // cmpss/cmpsd %tmp_xmm, %src
1030            // jnb/jnbe $check_positive
1031            // ud2 trap IntegerOverflow
1032            //
1033            // ;; if positive, it was a real overflow
1034            // check_positive:
1035            // xorpd %tmp_xmm, %tmp_xmm
1036            // cmpss/cmpsd %src, %tmp_xmm
1037            // jnb done
1038            // ud2 trap IntegerOverflow
1039            //
1040            // done:
1041
1042            let cmp_op: AsmInst = match src_size {
1043                Size64 => asm::inst::ucomisd_a::new(src, src).into(),
1044                Size32 => asm::inst::ucomiss_a::new(src, src).into(),
1045                _ => unreachable!(),
1046            };
1047
1048            let cvtt_op = |dst, src| Inst::External {
1049                inst: match (*src_size, *dst_size) {
1050                    (Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
1051                    (Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
1052                    (Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
1053                    (Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
1054                    _ => unreachable!(),
1055                },
1056            };
1057
1058            let done = sink.get_label();
1059
1060            // The truncation.
1061            cvtt_op(dst, src).emit(sink, info, state);
1062
1063            // Compare against 1, in case of overflow the dst operand was INT_MIN.
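            // (`cvttss2si`/`cvttsd2si` return INT_MIN on overflow or NaN, and
            // `INT_MIN - 1` is the only case where this subtraction sets OF, so the
            // `jno` below means the truncated result was in range.)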
1064            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 1);
1065            inst.emit(sink, info, state);
1066
1067            one_way_jmp(sink, CC::NO, done); // no overflow => done
1068
1069            // Check for NaN.
1070            cmp_op.emit(sink, info, state);
1071
1072            if *is_saturating {
1073                let not_nan = sink.get_label();
1074                one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
1075
1076                // For NaN, emit 0.
1077                let inst: AsmInst = match *dst_size {
1078                    OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
1079                    OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
1080                    _ => unreachable!(),
1081                };
1082                inst.emit(sink, info, state);
1083
1084                let inst = Inst::jmp_known(done);
1085                inst.emit(sink, info, state);
1086
1087                sink.bind_label(not_nan, state.ctrl_plane_mut());
1088
1089                // If the input was positive, saturate to INT_MAX.
1090
1091                // Zero out tmp_xmm.
1092                asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);
1093
1094                let inst: AsmInst = match src_size {
1095                    Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
1096                    Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
1097                    _ => unreachable!(),
1098                };
1099                inst.emit(sink, info, state);
1100
1101                // Jump if >= to done.
1102                one_way_jmp(sink, CC::NB, done);
1103
1104                // Otherwise, put INT_MAX.
1105                if *dst_size == OperandSize::Size64 {
1106                    let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, dst);
1107                    inst.emit(sink, info, state);
1108                } else {
1109                    let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, dst);
1110                    inst.emit(sink, info, state);
1111                }
1112            } else {
1113                let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
1114                inst.emit(sink, info, state);
1115
1116                // Check if INT_MIN was the correct result: determine the smallest floating point
1117                // number that would convert to INT_MIN, put it in a temporary register, and compare
1118                // against the src register.
1119            // If the src register is less than (or, in some cases, less than or equal to)
1120            // the threshold, trap!
1121
1122                let mut no_overflow_cc = CC::NB; // >=
1123                let output_bits = dst_size.to_bits();
1124                match *src_size {
1125                    OperandSize::Size32 => {
1126                        let cst = (-Ieee32::pow2(output_bits - 1)).bits();
1127                        let inst = Inst::imm(OperandSize::Size32, cst as u64, tmp_gpr);
1128                        inst.emit(sink, info, state);
1129                    }
1130                    OperandSize::Size64 => {
1131                        // An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
1132                        // so there are values less than -2^(N-1) that convert correctly to INT_MIN.
1133                        let cst = if output_bits < 64 {
1134                            no_overflow_cc = CC::NBE; // >
1135                            Ieee64::fcvt_to_sint_negative_overflow(output_bits)
1136                        } else {
1137                            -Ieee64::pow2(output_bits - 1)
1138                        };
1139                        let inst = Inst::imm(OperandSize::Size64, cst.bits(), tmp_gpr);
1140                        inst.emit(sink, info, state);
1141                    }
1142                    _ => unreachable!(),
1143                }
1144
1145                let inst: AsmInst = {
1146                    let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
1147                    match src_size {
1148                        Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
1149                        Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
1150                        _ => unreachable!(),
1151                    }
1152                };
1153                inst.emit(sink, info, state);
1154
1155                let inst: AsmInst = match src_size {
1156                    Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
1157                    Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
1158                    _ => unreachable!(),
1159                };
1160                inst.emit(sink, info, state);
1161
1162                // no trap if src >= or > threshold
1163                let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::INTEGER_OVERFLOW);
1164                inst.emit(sink, info, state);
1165
1166                // If positive, it was a real overflow.
1167
1168                // Zero out the tmp_xmm register.
1169                asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);
1170
1171                let inst: AsmInst = match src_size {
1172                    Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
1173                    Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
1174                    _ => unreachable!(),
1175                };
1176                inst.emit(sink, info, state);
1177
1178                // no trap if 0 >= src
1179                let inst = Inst::trap_if(CC::B, TrapCode::INTEGER_OVERFLOW);
1180                inst.emit(sink, info, state);
1181            }
1182
1183            sink.bind_label(done, state.ctrl_plane_mut());
1184        }
1185
1186        Inst::CvtFloatToUintSeq {
1187            src_size,
1188            dst_size,
1189            is_saturating,
1190            src,
1191            dst,
1192            tmp_gpr,
1193            tmp_xmm,
1194            tmp_xmm2,
1195        } => {
1196            use OperandSize::*;
1197
1198            let src = src.to_reg();
1199            let dst = dst.to_writable_reg();
1200            let tmp_gpr = tmp_gpr.to_writable_reg();
1201            let tmp_xmm = tmp_xmm.to_writable_reg();
1202            let tmp_xmm2 = tmp_xmm2.to_writable_reg();
1203
1204            // The only difference in behavior between saturating and non-saturating is how we
1205            // handle errors. Emits the following sequence:
1206            //
1207            // movaps/mov 2**(int_width - 1), %tmp_gpr
1208            // movq/movd %tmp_gpr, %tmp_xmm
1209            // cmpss/cmpsd %tmp_xmm, %src
1210            // jnb is_large
1211            //
1212            // ;; check for NaN inputs
1213            // jnp not_nan
1214            // -- non-saturating: ud2 trap BadConversionToInteger
1215            // -- saturating: xor %dst, %dst; j done
1216            //
1217            // not_nan:
1218            // cvttss2si/cvttsd2si %src, %dst
1219            // cmp 0, %dst
1220            // jnl done
1221            // -- non-saturating: ud2 trap IntegerOverflow
1222            // -- saturating: xor %dst, %dst; j done
1223            //
1224            // is_large:
1225            // mov %src, %tmp_xmm2
1226            // subss/subsd %tmp_xmm, %tmp_xmm2
1227            // cvttss2si/cvttsd2si %tmp_xmm2, %dst
1228            // cmp 0, %dst
1229            // jnl next_is_large
1230            // -- non-saturating: ud2 trap IntegerOverflow
1231            // -- saturating: movaps $UINT_MAX, %dst; j done
1232            //
1233            // next_is_large:
1234            // add 2**(int_width -1), %dst ;; 2 instructions for 64-bits integers
1235            //
1236            // done:
1237
1238            assert_ne!(tmp_xmm.to_reg(), src, "tmp_xmm clobbers src!");
1239
1240            let xor_op = |dst, src| Inst::External {
1241                inst: match *dst_size {
1242                    Size32 => asm::inst::xorl_rm::new(dst, src).into(),
1243                    Size64 => asm::inst::xorq_rm::new(dst, src).into(),
1244                    _ => unreachable!(),
1245                },
1246            };
1247
1248            let subs_op = |dst, src| Inst::External {
1249                inst: match *src_size {
1250                    Size32 => asm::inst::subss_a::new(dst, src).into(),
1251                    Size64 => asm::inst::subsd_a::new(dst, src).into(),
1252                    _ => unreachable!(),
1253                },
1254            };
1255
1256            let cvtt_op = |dst, src| Inst::External {
1257                inst: match (*src_size, *dst_size) {
1258                    (Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
1259                    (Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
1260                    (Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
1261                    (Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
1262                    _ => unreachable!(),
1263                },
1264            };
1265
1266            let done = sink.get_label();
1267
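            // `cst` is the bit pattern of 2.0^(dst_width - 1) in the source float
            // format: inputs below this threshold can be converted with the signed
            // truncating instruction directly, while inputs at or above it take the
            // `is_large` path below.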
1268            let cst = match src_size {
1269                OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,
1270                OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),
1271                _ => unreachable!(),
1272            };
1273
1274            let inst = Inst::imm(*src_size, cst, tmp_gpr);
1275            inst.emit(sink, info, state);
1276
1277            let inst: AsmInst = {
1278                let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
1279                match src_size {
1280                    Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
1281                    Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
1282                    _ => unreachable!(),
1283                }
1284            };
1285            inst.emit(sink, info, state);
1286
1287            let inst: AsmInst = match src_size {
1288                Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
1289                Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
1290                _ => unreachable!(),
1291            };
1292            inst.emit(sink, info, state);
1293
1294            let handle_large = sink.get_label();
1295            one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
1296
1297            if *is_saturating {
1298                // If not NaN jump over this 0-return, otherwise return 0
1299                let not_nan = sink.get_label();
1300                one_way_jmp(sink, CC::NP, not_nan);
1301
1302                xor_op(dst, dst).emit(sink, info, state);
1303
1304                let inst = Inst::jmp_known(done);
1305                inst.emit(sink, info, state);
1306                sink.bind_label(not_nan, state.ctrl_plane_mut());
1307            } else {
1308                // Trap.
1309                let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
1310                inst.emit(sink, info, state);
1311            }
1312
1313            // Actual truncation for small inputs: if the result is negative, then we had an
1314            // overflow (the input was negative and out of the unsigned range).
1315
1316            cvtt_op(dst, src).emit(sink, info, state);
1317
1318            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
1319            inst.emit(sink, info, state);
1320
1321            one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
1322
1323            if *is_saturating {
1324                // The input was "small" (< 2**(width - 1)), so the only way to get an integer
1325                // overflow is that the input was too small (negative): saturate to the min value, 0.
1326                let inst: AsmInst = match *dst_size {
1327                    OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
1328                    OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
1329                    _ => unreachable!(),
1330                };
1331                inst.emit(sink, info, state);
1332
1333                let inst = Inst::jmp_known(done);
1334                inst.emit(sink, info, state);
1335            } else {
1336                // Trap.
1337                asm::inst::ud2_zo::new(TrapCode::INTEGER_OVERFLOW).emit(sink, info, state);
1338            }
1339
1340            // Now handle large inputs.
1341
1342            sink.bind_label(handle_large, state.ctrl_plane_mut());
1343
1344            let inst = Inst::gen_move(tmp_xmm2, src, types::F64);
1345            inst.emit(sink, info, state);
1346
1347            subs_op(tmp_xmm2, tmp_xmm.to_reg()).emit(sink, info, state);
1348
1349            cvtt_op(dst, tmp_xmm2.to_reg()).emit(sink, info, state);
1350
1351            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
1352            inst.emit(sink, info, state);
1353
1354            if *is_saturating {
1355                let next_is_large = sink.get_label();
1356                one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
1357
1358                // The input was "large" (>= 2**(width - 1)), so the only way to get an integer
1359                // overflow is that the input was too large: saturate to the max value.
1360                let inst = Inst::imm(
1361                    OperandSize::Size64,
1362                    if *dst_size == OperandSize::Size64 {
1363                        u64::MAX
1364                    } else {
1365                        u32::MAX as u64
1366                    },
1367                    dst,
1368                );
1369                inst.emit(sink, info, state);
1370
1371                let inst = Inst::jmp_known(done);
1372                inst.emit(sink, info, state);
1373                sink.bind_label(next_is_large, state.ctrl_plane_mut());
1374            } else {
1375                let inst = Inst::trap_if(CC::L, TrapCode::INTEGER_OVERFLOW);
1376                inst.emit(sink, info, state);
1377            }
1378
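            // At this point we are on the large-input path and %dst holds
            // trunc(src - 2**(width - 1)); add the threshold back to form the
            // final unsigned result.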
1379            if *dst_size == OperandSize::Size64 {
1380                let inst = Inst::imm(OperandSize::Size64, 1 << 63, tmp_gpr);
1381                inst.emit(sink, info, state);
1382
1383                asm::inst::addq_rm::new(dst, tmp_gpr).emit(sink, info, state);
1384            } else {
1385                asm::inst::addl_mi::new(dst, asm::Imm32::new(1 << 31)).emit(sink, info, state);
1386            }
1387
1388            sink.bind_label(done, state.ctrl_plane_mut());
1389        }
1390
1391        Inst::LoadExtName {
1392            dst,
1393            name,
1394            offset,
1395            distance,
1396        } => {
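            // Three strategies are used below: a GOT-relative load when
            // generating PIC, a RIP-relative `lea` when the symbol is known to
            // be within ±2GB, and a 64-bit absolute `movabs` otherwise.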
1397            let name = &**name;
1398            let riprel = asm::Amode::RipRelative {
1399                target: asm::DeferredTarget::None,
1400            };
1401            if info.flags.is_pic() {
1402                // Generates: movq symbol@GOTPCREL(%rip), %dst
1403                asm::inst::movq_rm::new(*dst, riprel).emit(sink, info, state);
1404                let cur = sink.cur_offset();
1405                sink.add_reloc_at_offset(cur - 4, Reloc::X86GOTPCRel4, name, -4);
1406
1407                // The offset in the relocation above applies to the address of the
1408                // *GOT entry*, not to the loaded address, so we emit a separate
1409                // add instruction if a non-zero offset is needed.
1410                let offset = i32::try_from(*offset).unwrap();
1411                if offset != 0 {
1412                    asm::inst::addq_mi_sxl::new(PairedGpr::from(*dst), offset)
1413                        .emit(sink, info, state);
1414                }
1415            } else if distance == &RelocDistance::Near {
1416                // If we know the distance to the name is within 2GB (e.g., a
1417                // module-local function), we can generate a RIP-relative
1418                // address, with a relocation.
1419                asm::inst::leaq_rm::new(*dst, riprel).emit(sink, info, state);
1420                let cur = sink.cur_offset();
1421                sink.add_reloc_at_offset(cur - 4, Reloc::X86CallPCRel4, name, *offset - 4);
1422            } else {
1423                // Otherwise, load the full 64-bit absolute address into the
1424                // register via `movabs`, with a relocation to fill in the value.
1425                asm::inst::movabsq_oi::new(*dst, 0).emit(sink, info, state);
1426                let cur = sink.cur_offset();
1427                sink.add_reloc_at_offset(cur - 8, Reloc::Abs8, name, *offset);
1428            }
1429        }
1430
1431        Inst::AtomicRmwSeq {
1432            ty,
1433            op,
1434            mem,
1435            operand,
1436            temp,
1437            dst_old,
1438        } => {
1439            let operand = *operand;
1440            let temp = *temp;
1441            let temp_r = temp.map(|r| *r);
1442            let dst_old = *dst_old;
1443            let dst_old_r = dst_old.map(|r| *r);
1444            debug_assert_eq!(dst_old.to_reg(), regs::rax());
1445            let mem = mem.finalize(state.frame_layout(), sink).clone();
1446
1447            // Emit this:
1448            //    mov{zbq,zwq,zlq,q}     (%r_address), %rax    // rax = old value
1449            //  again:
1450            //    movq                   %rax, %r_temp         // rax = old value, r_temp = old value
1451            //    `op`q                  %r_operand, %r_temp   // rax = old value, r_temp = new value
1452            //    lock cmpxchg{b,w,l,q}  %r_temp, (%r_address) // try to store new value
1453            //    jnz again // If this is taken, rax will have a "revised" old value
1454            //
1455            // Operand conventions: IN:  %r_address, %r_operand OUT: %rax (old
1456            //    value), %r_temp (trashed), %rflags (trashed)
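            //
            // If the `lock cmpxchg` fails, it has already loaded the current
            // memory value into %rax, so the loop retries with that fresh value
            // without an explicit reload.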
1457            let again_label = sink.get_label();
1458
1459            // mov{zbq,zwq,zlq,q} (%r_address), %rax
1460            // No need to call `add_trap` here, since the `i1` emit will do that.
1461            let i1 = Inst::load(*ty, mem.clone(), dst_old_r, ExtKind::ZeroExtend);
1462            i1.emit(sink, info, state);
1463
1464            // again:
1465            sink.bind_label(again_label, state.ctrl_plane_mut());
1466
1467            // movq %rax, %r_temp
1468            asm::inst::movq_mr::new(temp, dst_old.to_reg()).emit(sink, info, state);
1469
1470            use AtomicRmwSeqOp as RmwOp;
1471            match op {
1472                RmwOp::Nand => {
1473                    // andq %r_operand, %r_temp
1474                    asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
1475
1476                    // notq %r_temp
1477                    asm::inst::notq_m::new(PairedGpr::from(temp)).emit(sink, info, state);
1478                }
1479                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
1480                    // cmp %r_temp, %r_operand
1481                    let temp = temp.to_reg();
1482                    match *ty {
1483                        types::I8 => asm::inst::cmpb_mr::new(operand, temp).emit(sink, info, state),
1484                        types::I16 => {
1485                            asm::inst::cmpw_mr::new(operand, temp).emit(sink, info, state)
1486                        }
1487                        types::I32 => {
1488                            asm::inst::cmpl_mr::new(operand, temp).emit(sink, info, state)
1489                        }
1490                        types::I64 => {
1491                            asm::inst::cmpq_mr::new(operand, temp).emit(sink, info, state)
1492                        }
1493                        _ => unreachable!(),
1494                    }
1495
1496                    // cmovcc %r_operand, %r_temp
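                    // The comparison above computes `%r_operand - %r_temp`, so,
                    // e.g., for `Umin` the `cmovbe` below overwrites `temp` with
                    // `operand` exactly when `operand <= temp` (unsigned),
                    // leaving the minimum in `temp`; the other cases are analogous.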
1497                    match op {
1498                        RmwOp::Umin => {
1499                            asm::inst::cmovbeq_rm::new(temp_r, *operand).emit(sink, info, state)
1500                        }
1501                        RmwOp::Umax => {
1502                            asm::inst::cmovaeq_rm::new(temp_r, *operand).emit(sink, info, state)
1503                        }
1504                        RmwOp::Smin => {
1505                            asm::inst::cmovleq_rm::new(temp_r, *operand).emit(sink, info, state)
1506                        }
1507                        RmwOp::Smax => {
1508                            asm::inst::cmovgeq_rm::new(temp_r, *operand).emit(sink, info, state)
1509                        }
1510                        _ => unreachable!(),
1511                    }
1512                }
1513                RmwOp::And => {
1514                    // andq %r_operand, %r_temp
1515                    asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
1516                }
1517                RmwOp::Or => {
1518                    // orq %r_operand, %r_temp
1519                    asm::inst::orq_rm::new(temp, operand).emit(sink, info, state);
1520                }
1521                RmwOp::Xor => {
1522                    // xorq %r_operand, %r_temp
1523                    asm::inst::xorq_rm::new(temp, operand).emit(sink, info, state);
1524                }
1525            }
1526
1527            // lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
1528            // No need to call `add_trap` here, since emitting `inst` below will do that.
1529            let temp = temp.to_reg();
1530            let dst_old = PairedGpr::from(dst_old);
1531            let inst: AsmInst = match *ty {
1532                types::I8 => asm::inst::lock_cmpxchgb_mr::new(mem, temp, dst_old).into(),
1533                types::I16 => asm::inst::lock_cmpxchgw_mr::new(mem, temp, dst_old).into(),
1534                types::I32 => asm::inst::lock_cmpxchgl_mr::new(mem, temp, dst_old).into(),
1535                types::I64 => asm::inst::lock_cmpxchgq_mr::new(mem, temp, dst_old).into(),
1536                _ => unreachable!(),
1537            };
1538            inst.emit(sink, info, state);
1539
1540            // jnz again
1541            one_way_jmp(sink, CC::NZ, again_label);
1542        }
1543
1544        Inst::Atomic128RmwSeq {
1545            op,
1546            mem,
1547            operand_low,
1548            operand_high,
1549            temp_low,
1550            temp_high,
1551            dst_old_low,
1552            dst_old_high,
1553        } => {
1554            let operand_low = *operand_low;
1555            let operand_high = *operand_high;
1556            let temp_low = *temp_low;
1557            let temp_high = *temp_high;
1558            let dst_old_low = *dst_old_low;
1559            let dst_old_high = *dst_old_high;
1560            debug_assert_eq!(temp_low.to_reg(), regs::rbx());
1561            debug_assert_eq!(temp_high.to_reg(), regs::rcx());
1562            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
1563            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
1564            let mem = mem.finalize(state.frame_layout(), sink).clone();
1565
1566            let again_label = sink.get_label();
1567
1568            // Load the initial value.
1569            asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
1570            asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);
1571
1572            // again:
1573            sink.bind_label(again_label, state.ctrl_plane_mut());
1574
1575            // Move old value to temp registers.
1576            asm::inst::movq_mr::new(temp_low, dst_old_low.to_reg()).emit(sink, info, state);
1577            asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg()).emit(sink, info, state);
1578
1579            // Perform the operation.
1580            use Atomic128RmwSeqOp as RmwOp;
1581            match op {
1582                RmwOp::Nand => {
1583                    // temp &= operand
1584                    asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
1585                    asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
1586
1587                    // temp = !temp
1588                    asm::inst::notq_m::new(PairedGpr::from(temp_low)).emit(sink, info, state);
1589                    asm::inst::notq_m::new(PairedGpr::from(temp_high)).emit(sink, info, state);
1590                }
1591                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
1592                    // Do a 128-bit comparison of LHS `temp` with RHS `operand` via `cmp`+`sbb`.
1593                    // Note the argument order is the opposite of the 64-bit case above.
1594                    asm::inst::cmpq_mr::new(temp_low.to_reg(), operand_low).emit(sink, info, state);
1595                    // This will clobber `temp_high`
1596                    asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
1597                    // Restore the clobbered value
1598                    asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg())
1599                        .emit(sink, info, state);
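                    // At this point the flags reflect the full 128-bit
                    // `temp - operand` comparison (the `sbb` folded in the borrow
                    // from the low halves), so the `cmov` pairs below overwrite
                    // `temp` with `operand` only when `operand` is the min/max
                    // being computed.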
1600                    match op {
1601                        RmwOp::Umin => {
1602                            asm::inst::cmovaeq_rm::new(temp_low, operand_low)
1603                                .emit(sink, info, state);
1604                            asm::inst::cmovaeq_rm::new(temp_high, operand_high)
1605                                .emit(sink, info, state);
1606                        }
1607                        RmwOp::Umax => {
1608                            asm::inst::cmovbq_rm::new(temp_low, operand_low)
1609                                .emit(sink, info, state);
1610                            asm::inst::cmovbq_rm::new(temp_high, operand_high)
1611                                .emit(sink, info, state);
1612                        }
1613                        RmwOp::Smin => {
1614                            asm::inst::cmovgeq_rm::new(temp_low, operand_low)
1615                                .emit(sink, info, state);
1616                            asm::inst::cmovgeq_rm::new(temp_high, operand_high)
1617                                .emit(sink, info, state);
1618                        }
1619                        RmwOp::Smax => {
1620                            asm::inst::cmovlq_rm::new(temp_low, operand_low)
1621                                .emit(sink, info, state);
1622                            asm::inst::cmovlq_rm::new(temp_high, operand_high)
1623                                .emit(sink, info, state);
1624                        }
1625                        _ => unreachable!(),
1626                    }
1627                }
1628                RmwOp::Add => {
1629                    asm::inst::addq_rm::new(temp_low, operand_low).emit(sink, info, state);
1630                    asm::inst::adcq_rm::new(temp_high, operand_high).emit(sink, info, state);
1631                }
1632                RmwOp::Sub => {
1633                    asm::inst::subq_rm::new(temp_low, operand_low).emit(sink, info, state);
1634                    asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
1635                }
1636                RmwOp::And => {
1637                    asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
1638                    asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
1639                }
1640                RmwOp::Or => {
1641                    asm::inst::orq_rm::new(temp_low, operand_low).emit(sink, info, state);
1642                    asm::inst::orq_rm::new(temp_high, operand_high).emit(sink, info, state);
1643                }
1644                RmwOp::Xor => {
1645                    asm::inst::xorq_rm::new(temp_low, operand_low).emit(sink, info, state);
1646                    asm::inst::xorq_rm::new(temp_high, operand_high).emit(sink, info, state);
1647                }
1648            }
1649
1650            // cmpxchg16b (mem)
1651            asm::inst::lock_cmpxchg16b_m::new(
1652                PairedGpr::from(dst_old_low),
1653                PairedGpr::from(dst_old_high),
1654                temp_low.to_reg(),
1655                temp_high.to_reg(),
1656                mem,
1657            )
1658            .emit(sink, info, state);
1659
1660            // jnz again
1661            one_way_jmp(sink, CC::NZ, again_label);
1662        }
1663
1664        Inst::Atomic128XchgSeq {
1665            mem,
1666            operand_low,
1667            operand_high,
1668            dst_old_low,
1669            dst_old_high,
1670        } => {
1671            let operand_low = *operand_low;
1672            let operand_high = *operand_high;
1673            let dst_old_low = *dst_old_low;
1674            let dst_old_high = *dst_old_high;
1675            debug_assert_eq!(operand_low, regs::rbx());
1676            debug_assert_eq!(operand_high, regs::rcx());
1677            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
1678            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
1679            let mem = mem.finalize(state.frame_layout(), sink).clone();
1680
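            // Since a plain 16-byte store is not guaranteed to be atomic here,
            // the exchange is performed as a `cmpxchg16b` loop: the expected
            // value in %rdx:%rax starts from the (possibly stale) plain loads
            // below and is refreshed by each failed `cmpxchg16b` until the swap
            // succeeds, leaving the old memory value in %rdx:%rax.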
1681            let again_label = sink.get_label();
1682
1683            // Load the initial value.
1684            asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
1685            asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);
1686
1687            // again:
1688            sink.bind_label(again_label, state.ctrl_plane_mut());
1689
1690            // cmpxchg16b (mem)
1691            asm::inst::lock_cmpxchg16b_m::new(
1692                PairedGpr::from(dst_old_low),
1693                PairedGpr::from(dst_old_high),
1694                operand_low,
1695                operand_high,
1696                mem,
1697            )
1698            .emit(sink, info, state);
1699
1700            // jnz again
1701            one_way_jmp(sink, CC::NZ, again_label);
1702        }
1703
1704        Inst::ElfTlsGetAddr { symbol, dst } => {
1705            let dst = dst.to_reg().to_reg();
1706            debug_assert_eq!(dst, regs::rax());
1707
1708            // N.B.: Must be exactly this byte sequence; the linker requires it,
1709            // because it must know how to rewrite the bytes.
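            // (This is the x86-64 ELF "general dynamic" TLS access sequence; the
            // data16 prefixes pad it to the canonical form described in the
            // System V x86-64 psABI so the linker can recognize it and, when
            // possible, relax it to a cheaper TLS model.)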
1710
1711            // data16 lea gv@tlsgd(%rip),%rdi
1712            sink.put1(0x66); // data16
1713            sink.put1(0b01001000); // REX.W
1714            sink.put1(0x8d); // LEA
1715            sink.put1(0x3d); // ModRM byte
1716            emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);
1717            sink.put4(0); // offset
1718
1719            // data16 data16 callq __tls_get_addr-4
1720            sink.put1(0x66); // data16
1721            sink.put1(0x66); // data16
1722            sink.put1(0b01001000); // REX.W
1723            sink.put1(0xe8); // CALL
1724            emit_reloc(
1725                sink,
1726                Reloc::X86CallPLTRel4,
1727                &ExternalName::LibCall(LibCall::ElfTlsGetAddr),
1728                -4,
1729            );
1730            sink.put4(0); // offset
1731        }
1732
1733        Inst::MachOTlsGetAddr { symbol, dst } => {
1734            let dst = dst.to_reg().to_reg();
1735            debug_assert_eq!(dst, regs::rax());
1736
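            // On Mach-O, `gv@tlv(%rip)` resolves to this symbol's thread-local
            // variable descriptor; its first field is a getter function, so the
            // indirect `call *(%rdi)` below returns the variable's address in
            // %rax.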
1737            // movq gv@tlv(%rip), %rdi
1738            sink.put1(0x48); // REX.W
1739            sink.put1(0x8b); // MOV
1740            sink.put1(0x3d); // ModRM byte
1741            emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);
1742            sink.put4(0); // offset
1743
1744            asm::inst::callq_m::new(asm::Amode::ImmReg {
1745                base: Gpr::RDI,
1746                simm32: asm::AmodeOffsetPlusKnownOffset::ZERO,
1747                trap: None,
1748            })
1749            .emit(sink, info, state);
1750        }
1751
1752        Inst::CoffTlsGetAddr { symbol, dst, tmp } => {
1753            let dst = dst.to_reg().to_reg();
1754            debug_assert_eq!(dst, regs::rax());
1755
1756            // tmp is used below directly as %rcx
1757            let tmp = tmp.to_reg().to_reg();
1758            debug_assert_eq!(tmp, regs::rcx());
1759
1760            // See: https://gcc.godbolt.org/z/M8or9x6ss
1761            // And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282
1762
1763            // Emit the following sequence
1764            // movl	(%rip), %eax          ; IMAGE_REL_AMD64_REL32	_tls_index
1765            // movq	%gs:88, %rcx
1766            // movq	(%rcx,%rax,8), %rax
1767            // leaq	(%rax), %rax          ; Reloc: IMAGE_REL_AMD64_SECREL	symbol
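            //
            // The SECREL relocation supplies the symbol's offset within its
            // (TLS) section, so the final `lea` produces this thread's address
            // of the symbol in %rax.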
1768
1769            // Load TLS index for current thread
1770            // movl	(%rip), %eax
1771            sink.put1(0x8b); // mov
1772            sink.put1(0x05);
1773            emit_reloc(
1774                sink,
1775                Reloc::X86PCRel4,
1776                &ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),
1777                -4,
1778            );
1779            sink.put4(0); // offset
1780
1781            // movq	%gs:88, %rcx
1782            // Load the TLS Storage Array pointer
1783            // The gs segment register refers to the base address of the TEB on x64.
1784            // 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:
1785            sink.put_data(&[
1786                0x65, 0x48, // GS segment override prefix, REX.W
1787                0x8b, // MOV opcode
1788                0x0c, 0x25, // ModRM + SIB: disp32 addressing, dst %rcx
1789                0x58, 0x00, 0x00, 0x00, // disp32 = 0x58, the ThreadLocalStoragePointer offset in the TEB
1790            ]);
1791
1792            // movq	(%rcx,%rax,8), %rax
1793            // Load the actual TLS entry for this thread.
1794            // Computes ThreadLocalStoragePointer + _tls_index*8
1795            sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);
1796
1797            // leaq	(%rax), %rax
1798            sink.put1(0x48);
1799            sink.put1(0x8d);
1800            sink.put1(0x80);
1801            emit_reloc(sink, Reloc::X86SecRel, symbol, 0);
1802            sink.put4(0); // offset
1803        }
1804
1805        Inst::Unwind { inst } => {
1806            sink.add_unwind(inst.clone());
1807        }
1808
1809        Inst::DummyUse { .. } => {
1810            // Nothing.
1811        }
1812
1813        Inst::LabelAddress { dst, label } => {
1814            // Emit an LEA with a LabelUse given this label.
1815            asm::inst::leaq_rm::new(*dst, Amode::rip_relative(*label)).emit(sink, info, state);
1816        }
1817
1818        Inst::External { inst } => {
1819            let frame = state.frame_layout();
1820            emit_maybe_shrink(
1821                inst,
1822                &mut external::AsmCodeSink {
1823                    sink,
1824
1825                    // These values are transcribed from what is happening in
1826                    // `SyntheticAmode::finalize`. This, plus the `Into` logic
1827                    // converting a `SyntheticAmode` to its external counterpart, are
1828                    // necessary to communicate Cranelift's internal offsets to the
1829                    // assembler; due to when Cranelift determines these offsets, this
1830                    // happens quite late (i.e., here during emission).
1831                    incoming_arg_offset: i32::try_from(
1832                        frame.tail_args_size + frame.setup_area_size,
1833                    )
1834                    .unwrap(),
1835                    slot_offset: i32::try_from(frame.outgoing_args_size).unwrap(),
1836                },
1837            );
1838        }
1839    }
1840
1841    state.clear_post_insn();
1842}
1843
1844/// Emit the common sequence used for both direct and indirect tail calls:
1845///
1846/// * Copy the new frame's stack arguments over the top of our current frame.
1847///
1848/// * Restore the old frame pointer.
1849///
1850/// * Initialize the tail callee's stack pointer (simultaneously deallocating
1851///   the temporary stack space we allocated when creating the new frame's stack
1852///   arguments).
1853///
1854/// * Move the return address into its stack slot.
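///
/// As an illustration with made-up sizes: if the current frame reserved 32
/// bytes of tail-call argument space but the callee only needs 16, the saved
/// return address is shifted up by 16 bytes and %rsp is bumped by 16 before
/// jumping to the callee.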
1855fn emit_return_call_common_sequence<T>(
1856    sink: &mut MachBuffer<Inst>,
1857    info: &EmitInfo,
1858    state: &mut EmitState,
1859    call_info: &ReturnCallInfo<T>,
1860) {
1861    assert!(
1862        info.flags.preserve_frame_pointers(),
1863        "frame pointers aren't fundamentally required for tail calls, \
1864                 but the current implementation relies on them being present"
1865    );
1866
1867    let tmp = call_info.tmp.to_writable_reg();
1868
1869    for inst in
1870        X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())
1871    {
1872        inst.emit(sink, info, state);
1873    }
1874
1875    for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(
1876        CallConv::Tail,
1877        &info.flags,
1878        &info.isa_flags,
1879        state.frame_layout(),
1880    ) {
1881        inst.emit(sink, info, state);
1882    }
1883
1884    let incoming_args_diff = state.frame_layout().tail_args_size - call_info.new_stack_arg_size;
1885    if incoming_args_diff > 0 {
1886        // Move the saved return address up by `incoming_args_diff`.
1887        let addr = Amode::imm_reg(0, regs::rsp());
1888        asm::inst::movq_rm::new(tmp, addr).emit(sink, info, state);
1889        asm::inst::movq_mr::new(
1890            Amode::imm_reg(i32::try_from(incoming_args_diff).unwrap(), regs::rsp()),
1891            Gpr::unwrap_new(tmp.to_reg()),
1892        )
1893        .emit(sink, info, state);
1894
1895        // Increment the stack pointer to shrink the argument area for the new
1896        // call.
1897        let rsp = Writable::from_reg(regs::rsp());
1898        let incoming_args_diff = i32::try_from(incoming_args_diff)
1899            .expect("`incoming_args_diff` is too large to fit in a 32-bit signed immediate");
1900        Inst::addq_mi(rsp, incoming_args_diff).emit(sink, info, state);
1901    }
1902}
1903
1904/// Convenience trait to have an `emit` method on all `asm::inst::*` variants.
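/// This lets call sites above write, e.g.,
/// `asm::inst::movq_mr::new(temp, dst_old.to_reg()).emit(sink, info, state)`
/// instead of wrapping each instruction in `Inst::External` by hand.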
1905trait ExternalEmit {
1906    fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState);
1907}
1908
1909impl<I> ExternalEmit for I
1910where
1911    I: Into<asm::inst::Inst<CraneliftRegisters>>,
1912{
1913    fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState) {
1914        Inst::External { inst: self.into() }.emit(sink, info, state)
1915    }
1916}
1917
1918/// Attempt to "shrink" the provided `inst`.
1919///
1920/// This function inspects `inst` and, where possible, encodes a semantically
1921/// equivalent instruction with a smaller binary representation into `sink`.
1922/// This is only done for shrinking opportunities which require register
1923/// allocation to have already happened; for example, shrinking immediates
1924/// should be done during instruction selection, not at this point.
1925///
1926/// An example of this optimization is the `AND` instruction. The Intel manual
1927/// has a smaller encoding for `AND AL, imm8` than it does for `AND r/m8, imm8`.
1928/// Instructions are matched against such patterns below, and if regalloc
1929/// placed the relevant operand in `rax` then the smaller variant is emitted
1930/// instead.
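///
/// For example, `and $0x7f, %al` encodes as two bytes (`24 7f`) via the
/// `AL`-specific form, versus three (`80 e0 7f`) via the generic `r/m8, imm8`
/// form; a small illustration of the size saving this pass targets.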
1930fn emit_maybe_shrink(inst: &AsmInst, sink: &mut impl asm::CodeSink) {
1931    use cranelift_assembler_x64::GprMem;
1932    use cranelift_assembler_x64::inst::*;
1933
1934    type R = CraneliftRegisters;
1935    const RAX: PairedGpr = PairedGpr {
1936        read: Gpr::RAX,
1937        write: Writable::from_reg(Gpr::RAX),
1938    };
1939    const RAX_RM: GprMem<PairedGpr, Gpr> = GprMem::Gpr(RAX);
1940
1941    match *inst {
1942        // and
1943        Inst::andb_mi(andb_mi { rm8: RAX_RM, imm8 }) => andb_i::<R>::new(RAX, imm8).encode(sink),
1944        Inst::andw_mi(andw_mi {
1945            rm16: RAX_RM,
1946            imm16,
1947        }) => andw_i::<R>::new(RAX, imm16).encode(sink),
1948        Inst::andl_mi(andl_mi {
1949            rm32: RAX_RM,
1950            imm32,
1951        }) => andl_i::<R>::new(RAX, imm32).encode(sink),
1952        Inst::andq_mi_sxl(andq_mi_sxl {
1953            rm64: RAX_RM,
1954            imm32,
1955        }) => andq_i_sxl::<R>::new(RAX, imm32).encode(sink),
1956
1957        // or
1958        Inst::orb_mi(orb_mi { rm8: RAX_RM, imm8 }) => orb_i::<R>::new(RAX, imm8).encode(sink),
1959        Inst::orw_mi(orw_mi {
1960            rm16: RAX_RM,
1961            imm16,
1962        }) => orw_i::<R>::new(RAX, imm16).encode(sink),
1963        Inst::orl_mi(orl_mi {
1964            rm32: RAX_RM,
1965            imm32,
1966        }) => orl_i::<R>::new(RAX, imm32).encode(sink),
1967        Inst::orq_mi_sxl(orq_mi_sxl {
1968            rm64: RAX_RM,
1969            imm32,
1970        }) => orq_i_sxl::<R>::new(RAX, imm32).encode(sink),
1971
1972        // xor
1973        Inst::xorb_mi(xorb_mi { rm8: RAX_RM, imm8 }) => xorb_i::<R>::new(RAX, imm8).encode(sink),
1974        Inst::xorw_mi(xorw_mi {
1975            rm16: RAX_RM,
1976            imm16,
1977        }) => xorw_i::<R>::new(RAX, imm16).encode(sink),
1978        Inst::xorl_mi(xorl_mi {
1979            rm32: RAX_RM,
1980            imm32,
1981        }) => xorl_i::<R>::new(RAX, imm32).encode(sink),
1982        Inst::xorq_mi_sxl(xorq_mi_sxl {
1983            rm64: RAX_RM,
1984            imm32,
1985        }) => xorq_i_sxl::<R>::new(RAX, imm32).encode(sink),
1986
1987        // add
1988        Inst::addb_mi(addb_mi { rm8: RAX_RM, imm8 }) => addb_i::<R>::new(RAX, imm8).encode(sink),
1989        Inst::addw_mi(addw_mi {
1990            rm16: RAX_RM,
1991            imm16,
1992        }) => addw_i::<R>::new(RAX, imm16).encode(sink),
1993        Inst::addl_mi(addl_mi {
1994            rm32: RAX_RM,
1995            imm32,
1996        }) => addl_i::<R>::new(RAX, imm32).encode(sink),
1997        Inst::addq_mi_sxl(addq_mi_sxl {
1998            rm64: RAX_RM,
1999            imm32,
2000        }) => addq_i_sxl::<R>::new(RAX, imm32).encode(sink),
2001
2002        // adc
2003        Inst::adcb_mi(adcb_mi { rm8: RAX_RM, imm8 }) => adcb_i::<R>::new(RAX, imm8).encode(sink),
2004        Inst::adcw_mi(adcw_mi {
2005            rm16: RAX_RM,
2006            imm16,
2007        }) => adcw_i::<R>::new(RAX, imm16).encode(sink),
2008        Inst::adcl_mi(adcl_mi {
2009            rm32: RAX_RM,
2010            imm32,
2011        }) => adcl_i::<R>::new(RAX, imm32).encode(sink),
2012        Inst::adcq_mi_sxl(adcq_mi_sxl {
2013            rm64: RAX_RM,
2014            imm32,
2015        }) => adcq_i_sxl::<R>::new(RAX, imm32).encode(sink),
2016
2017        // sub
2018        Inst::subb_mi(subb_mi { rm8: RAX_RM, imm8 }) => subb_i::<R>::new(RAX, imm8).encode(sink),
2019        Inst::subw_mi(subw_mi {
2020            rm16: RAX_RM,
2021            imm16,
2022        }) => subw_i::<R>::new(RAX, imm16).encode(sink),
2023        Inst::subl_mi(subl_mi {
2024            rm32: RAX_RM,
2025            imm32,
2026        }) => subl_i::<R>::new(RAX, imm32).encode(sink),
2027        Inst::subq_mi_sxl(subq_mi_sxl {
2028            rm64: RAX_RM,
2029            imm32,
2030        }) => subq_i_sxl::<R>::new(RAX, imm32).encode(sink),
2031
2032        // sbb
2033        Inst::sbbb_mi(sbbb_mi { rm8: RAX_RM, imm8 }) => sbbb_i::<R>::new(RAX, imm8).encode(sink),
2034        Inst::sbbw_mi(sbbw_mi {
2035            rm16: RAX_RM,
2036            imm16,
2037        }) => sbbw_i::<R>::new(RAX, imm16).encode(sink),
2038        Inst::sbbl_mi(sbbl_mi {
2039            rm32: RAX_RM,
2040            imm32,
2041        }) => sbbl_i::<R>::new(RAX, imm32).encode(sink),
2042        Inst::sbbq_mi_sxl(sbbq_mi_sxl {
2043            rm64: RAX_RM,
2044            imm32,
2045        }) => sbbq_i_sxl::<R>::new(RAX, imm32).encode(sink),
2046
2047        // cmp
2048        Inst::cmpb_mi(cmpb_mi {
2049            rm8: GprMem::Gpr(Gpr::RAX),
2050            imm8,
2051        }) => cmpb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
2052        Inst::cmpw_mi(cmpw_mi {
2053            rm16: GprMem::Gpr(Gpr::RAX),
2054            imm16,
2055        }) => cmpw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
2056        Inst::cmpl_mi(cmpl_mi {
2057            rm32: GprMem::Gpr(Gpr::RAX),
2058            imm32,
2059        }) => cmpl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
2060        Inst::cmpq_mi(cmpq_mi {
2061            rm64: GprMem::Gpr(Gpr::RAX),
2062            imm32,
2063        }) => cmpq_i::<R>::new(Gpr::RAX, imm32).encode(sink),
2064
2065        // test
2066        Inst::testb_mi(testb_mi {
2067            rm8: GprMem::Gpr(Gpr::RAX),
2068            imm8,
2069        }) => testb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
2070        Inst::testw_mi(testw_mi {
2071            rm16: GprMem::Gpr(Gpr::RAX),
2072            imm16,
2073        }) => testw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
2074        Inst::testl_mi(testl_mi {
2075            rm32: GprMem::Gpr(Gpr::RAX),
2076            imm32,
2077        }) => testl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
2078        Inst::testq_mi(testq_mi {
2079            rm64: GprMem::Gpr(Gpr::RAX),
2080            imm32,
2081        }) => testq_i::<R>::new(Gpr::RAX, imm32).encode(sink),
2082
2083        // lea
2084        Inst::leal_rm(leal_rm { r32, m32 }) => emit_lea(
2085            r32,
2086            m32,
2087            sink,
2088            |dst, amode, s| leal_rm::<R>::new(dst, amode).encode(s),
2089            |dst, simm32, s| addl_mi::<R>::new(dst, simm32.cast_unsigned()).encode(s),
2090            |dst, reg, s| addl_rm::<R>::new(dst, reg).encode(s),
2091        ),
2092        Inst::leaq_rm(leaq_rm { r64, m64 }) => emit_lea(
2093            r64,
2094            m64,
2095            sink,
2096            |dst, amode, s| leaq_rm::<R>::new(dst, amode).encode(s),
2097            |dst, simm32, s| addq_mi_sxl::<R>::new(dst, simm32).encode(s),
2098            |dst, reg, s| addq_rm::<R>::new(dst, reg).encode(s),
2099        ),
2100
2101        // All other instructions cannot be shrunk; fall through to here and
2102        // encode them as usual.
2103        _ => inst.encode(sink),
2104    }
2105}
2106
2107/// If `lea` can actually be encoded as an `add` then do that instead.
2108/// Currently all candidate `iadd`s are lowered to an `lea` pseudo-instruction,
2109/// but maximizing the use of `lea` is not necessarily optimal. The `lea`
2110/// instruction goes through dedicated address-generation units, which are
2111/// finite in number and disjoint from the general ALU, so if everything uses
2112/// `lea` then those units can get saturated while the ALU sits idle.
2113///
2114/// To help make use of more parts of a CPU, this attempts to use `add` when
2115/// it's semantically equivalent to `lea`, or otherwise when the `dst` register
2116/// is the same as the `base` or `index` register.
2117///
2118/// FIXME: ideally regalloc is informed of this constraint. Register allocation
2119/// of `lea` should "attempt" to put the `base` in the same register as `dst`
2120/// but not at the expense of generating a `mov` instruction. Currently that's
2121/// not possible but perhaps one day it may be worth it.
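///
/// As an illustration: `lea 16(%rax), %rax` is rewritten to `add $16, %rax`,
/// and `lea (%rax,%rbx,1), %rax` to `add %rbx, %rax`; any other shape keeps
/// the original `lea`.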
2122fn emit_lea<S>(
2123    dst: asm::Gpr<WritableGpr>,
2124    addr: asm::Amode<Gpr>,
2125    sink: &mut S,
2126    lea: fn(WritableGpr, asm::Amode<Gpr>, &mut S),
2127    add_mi: fn(PairedGpr, i32, &mut S),
2128    add_rm: fn(PairedGpr, Gpr, &mut S),
2129) where
2130    S: asm::CodeSink,
2131{
2132    match addr {
2133        // If `base == dst` then this is `add dst, $imm`, so encode that
2134        // instead.
2135        asm::Amode::ImmReg {
2136            base,
2137            simm32:
2138                asm::AmodeOffsetPlusKnownOffset {
2139                    simm32,
2140                    offset: None,
2141                },
2142            trap: None,
2143        } if dst.as_ref().to_reg() == base => add_mi(
2144            PairedGpr {
2145                read: base,
2146                write: *dst.as_ref(),
2147            },
2148            simm32.value(),
2149            sink,
2150        ),
2151
2152        // If the offset is 0 and the shift is a scale of 1, then:
2153        //
2154        // * If `base == dst`, then this is `addq dst, index`
2155        // * If `index == dst`, then this is `addq dst, base`
2156        asm::Amode::ImmRegRegShift {
2157            base,
2158            index,
2159            scale: asm::Scale::One,
2160            simm32: asm::AmodeOffset::ZERO,
2161            trap: None,
2162        } => {
2163            if dst.as_ref().to_reg() == base {
2164                add_rm(
2165                    PairedGpr {
2166                        read: base,
2167                        write: *dst.as_ref(),
2168                    },
2169                    *index.as_ref(),
2170                    sink,
2171                )
2172            } else if dst.as_ref().to_reg() == *index.as_ref() {
2173                add_rm(
2174                    PairedGpr {
2175                        read: *index.as_ref(),
2176                        write: *dst.as_ref(),
2177                    },
2178                    base,
2179                    sink,
2180                )
2181            } else {
2182                lea(*dst.as_ref(), addr, sink)
2183            }
2184        }
2185
2186        _ => lea(*dst.as_ref(), addr, sink),
2187    }
2188}