; rustc_codegen_nvvm 0.3.0
; A codegen backend for rustc which targets the libnvvm CUDA library.
;
; This is a hand-written LLVM IR module containing extra helper functions that are easier
; to write by hand. They mostly wrap NVVM intrinsics in plain functions so that rustc does
; not treat them as LLVM intrinsics, and so you don't always need nightly for that.
;
; If you update this file, make sure to regenerate libintrinsics.bc by running llvm-as
; (make sure you use LLVM 7, or the resulting bitcode won't load into libnvvm).
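;
; For example (a sketch, assuming llvm-as from an LLVM 7 toolchain and the file names
; used in this crate):
;
;   llvm-as libintrinsics.ll -o libintrinsics.bc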
source_filename = "libintrinsics"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; thread ----

define i32 @__nvvm_thread_idx_x() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
}

define i32 @__nvvm_thread_idx_y() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
}

define i32 @__nvvm_thread_idx_z() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
  ret i32 %0
}

; block dimension ----

define i32 @__nvvm_block_dim_x() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  ret i32 %0
}

define i32 @__nvvm_block_dim_y() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
  ret i32 %0
}

define i32 @__nvvm_block_dim_z() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
  ret i32 %0
}

; block idx ----

define i32 @__nvvm_block_idx_x() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
}

define i32 @__nvvm_block_idx_y() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
}

define i32 @__nvvm_block_idx_z() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
  ret i32 %0
}

; grid dimension ---- 

define i32 @__nvvm_grid_dim_x() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  ret i32 %0
}

define i32 @__nvvm_grid_dim_y() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
  ret i32 %0
}

define i32 @__nvvm_grid_dim_z() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
  ret i32 %0
}

; warp ----

define i32 @__nvvm_warp_size() #0 {
start:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
  ret i32 %0
}

declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
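
; For illustration only (this module does not define it): kernel code generated by the
; backend calls these wrappers like ordinary functions. A flat thread index along x could
; be computed roughly like this (%tid, %ctaid, %ntid, %base and %gid are just names for
; the example):
;
;   %tid   = call i32 @__nvvm_thread_idx_x()
;   %ctaid = call i32 @__nvvm_block_idx_x()
;   %ntid  = call i32 @__nvvm_block_dim_x()
;   %base  = mul i32 %ctaid, %ntid
;   %gid   = add i32 %base, %tid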

; other ----

define void @__nvvm_block_barrier() #1 {
start:
  call void @llvm.nvvm.barrier0()
  ret void
}

declare void @llvm.nvvm.barrier0()

define void @__nvvm_grid_fence() #1 {
start:
  call void @llvm.nvvm.membar.cta()
  ret void
}

declare void @llvm.nvvm.membar.cta()

define void @__nvvm_device_fence() #1 {
start:
  call void @llvm.nvvm.membar.gl()
  ret void
}

declare void @llvm.nvvm.membar.gl()

define void @__nvvm_system_fence() #1 {
start:
  call void @llvm.nvvm.membar.sys()
  ret void
}

declare void @llvm.nvvm.membar.sys()

define void @__nvvm_trap() #1 {
start:
  call void @llvm.trap()
  unreachable
}

declare void @llvm.trap()

; math stuff -------------
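;
; 8-bit checked arithmetic is widened to 16 bits and goes through the i16
; *.with.overflow intrinsics declared below instead of the 8-bit ones.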

define {i8, i1} @__nvvm_i8_addo(i8, i8) #0 {
start:
  %2 = sext i8 %0 to i16
  %3 = sext i8 %1 to i16
  %4 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %2, i16 %3)
  %5 = extractvalue {i16, i1} %4, 0
  %6 = trunc i16 %5 to i8
  ; the widened add cannot overflow i16 for sign-extended i8 inputs, so the real
  ; i8 overflow check is whether the result still fits in i8 after truncation
  %7 = sext i8 %6 to i16
  %8 = icmp ne i16 %5, %7
  %9 = insertvalue {i8, i1} undef, i8 %6, 0
  %10 = insertvalue {i8, i1} %9, i1 %8, 1
  ret {i8, i1} %10
}
declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) #0

define {i8, i1} @__nvvm_u8_addo(i8, i8) #0 {
start:
  ; note: with sign-extended i8 inputs, the unsigned i16 overflow flag from the
  ; intrinsic below matches overflow of the 8-bit unsigned add exactly, so sext is fine here
  %2 = sext i8 %0 to i16
  %3 = sext i8 %1 to i16
  %4 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %2, i16 %3)
  %5 = extractvalue {i16, i1} %4, 0
  %6 = extractvalue {i16, i1} %4, 1
  %7 = trunc i16 %5 to i8
  %8 = insertvalue {i8, i1} undef, i8 %7, 0
  %9 = insertvalue {i8, i1} %8, i1 %6, 1
  ret {i8, i1} %9
}
declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) #0

define {i8, i1} @__nvvm_i8_subo(i8, i8) #0 {
start:
  %2 = sext i8 %0 to i16
  %3 = sext i8 %1 to i16
  %4 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %2, i16 %3)
  %5 = extractvalue {i16, i1} %4, 0
  %6 = trunc i16 %5 to i8
  ; the widened sub cannot overflow i16 for sign-extended i8 inputs, so the real
  ; i8 overflow check is whether the result still fits in i8 after truncation
  %7 = sext i8 %6 to i16
  %8 = icmp ne i16 %5, %7
  %9 = insertvalue {i8, i1} undef, i8 %6, 0
  %10 = insertvalue {i8, i1} %9, i1 %8, 1
  ret {i8, i1} %10
}
declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16) #0

define {i8, i1} @__nvvm_u8_subo(i8, i8) #0 {
start:
  ; note: with sign-extended i8 inputs the unsigned borrow flag from the intrinsic below
  ; still matches 8-bit unsigned subtraction overflow (sext preserves the unsigned ordering)
  %2 = sext i8 %0 to i16
  %3 = sext i8 %1 to i16
  %4 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %2, i16 %3)
  %5 = extractvalue {i16, i1} %4, 0
  %6 = extractvalue {i16, i1} %4, 1
  %7 = trunc i16 %5 to i8
  %8 = insertvalue {i8, i1} undef, i8 %7, 0
  %9 = insertvalue {i8, i1} %8, i1 %6, 1
  ret {i8, i1} %9
}
declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16) #0

define {i8, i1} @__nvvm_i8_mulo(i8, i8) #0 {
start:
  %2 = sext i8 %0 to i16
  %3 = sext i8 %1 to i16
  %4 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %2, i16 %3)
  %5 = extractvalue {i16, i1} %4, 0
  %6 = trunc i16 %5 to i8
  ; the widened mul cannot overflow i16 for sign-extended i8 inputs, so the real
  ; i8 overflow check is whether the product still fits in i8 after truncation
  %7 = sext i8 %6 to i16
  %8 = icmp ne i16 %5, %7
  %9 = insertvalue {i8, i1} undef, i8 %6, 0
  %10 = insertvalue {i8, i1} %9, i1 %8, 1
  ret {i8, i1} %10
}
declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) #0

define {i8, i1} @__nvvm_u8_mulo(i8, i8) #0 {
start:
  ; zero-extend: the operands are unsigned
  %2 = zext i8 %0 to i16
  %3 = zext i8 %1 to i16
  %4 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %2, i16 %3)
  %5 = extractvalue {i16, i1} %4, 0
  %6 = trunc i16 %5 to i8
  ; the widened mul cannot overflow i16 for zero-extended i8 inputs, so overflow
  ; means the product no longer fits in 8 bits (nonzero high byte)
  %7 = zext i8 %6 to i16
  %8 = icmp ne i16 %5, %7
  %9 = insertvalue {i8, i1} undef, i8 %6, 0
  %10 = insertvalue {i8, i1} %9, i1 %8, 1
  ret {i8, i1} %10
}
declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) #0

; This is a bit weird: the codegen needs to call functions defined in Rust crates
; (compiler_builtins) as if they were intrinsics, but it cannot reference them by their
; real names directly, otherwise we get really odd and incorrect behavior in the crate
; they're defined in. So we wrap them in functions whose names are opaque to the codegen,
; which is what these do.
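;
; For illustration only: the codegen passes 128-bit integers to these wrappers as
; <2 x i64>. A checked i128 add emitted by it could look roughly like this (a sketch,
; %a and %b are just example values; the real lowering lives in the codegen):
;
;   %lhs = bitcast i128 %a to <2 x i64>
;   %rhs = bitcast i128 %b to <2 x i64>
;   %res = call {<2 x i64>, i1} @__nvvm_i128_addo(<2 x i64> %lhs, <2 x i64> %rhs)
;   %val = extractvalue {<2 x i64>, i1} %res, 0
;   %sum = bitcast <2 x i64> %val to i128
;   %ovf = extractvalue {<2 x i64>, i1} %res, 1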

define {<2 x i64>, i1} @__nvvm_i128_addo(<2 x i64>, <2 x i64>) #0 {
start:
  %2 = call {<2 x i64>, i1} @__rust_i128_addo(<2 x i64> %0, <2 x i64> %1)
  ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_i128_addo(<2 x i64>, <2 x i64>) #0

define {<2 x i64>, i1} @__nvvm_u128_addo(<2 x i64>, <2 x i64>) #0 {
start:
  %2 = call {<2 x i64>, i1} @__rust_u128_addo(<2 x i64> %0, <2 x i64> %1)
  ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_u128_addo(<2 x i64>, <2 x i64>) #0

define {<2 x i64>, i1} @__nvvm_i128_subo(<2 x i64>, <2 x i64>) #0 {
start:
  %2 = call {<2 x i64>, i1} @__rust_i128_subo(<2 x i64> %0, <2 x i64> %1)
  ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_i128_subo(<2 x i64>, <2 x i64>) #0

define {<2 x i64>, i1} @__nvvm_u128_subo(<2 x i64>, <2 x i64>) #0 {
start:
  %2 = call {<2 x i64>, i1} @__rust_u128_subo(<2 x i64> %0, <2 x i64> %1)
  ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_u128_subo(<2 x i64>, <2 x i64>) #0

define {<2 x i64>, i1} @__nvvm_i128_mulo(<2 x i64>, <2 x i64>) #0 {
start:
  %2 = call {<2 x i64>, i1} @__rust_i128_mulo(<2 x i64> %0, <2 x i64> %1)
  ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_i128_mulo(<2 x i64>, <2 x i64>) #0

define {<2 x i64>, i1} @__nvvm_u128_mulo(<2 x i64>, <2 x i64>) #0 {
start:
  %2 = call {<2 x i64>, i1} @__rust_u128_mulo(<2 x i64> %0, <2 x i64> %1)
  ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_u128_mulo(<2 x i64>, <2 x i64>) #0

attributes #0 = { alwaysinline speculatable }
attributes #1 = { alwaysinline }