; This is a hand-written LLVM IR module containing extra functions that are easier to
; write by hand. Most of them wrap NVVM intrinsics in new functions so that rustc does
; not treat them as LLVM intrinsics, and so callers do not need a nightly compiler for them.
;
; If you update this file, regenerate libintrinsics.bc by running llvm-as on it. Make sure
; you use LLVM 7, otherwise the resulting bitcode will not load into libnvvm.
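;
; The wrappers defined here are intended to be called from Rust device code through plain
; extern declarations, roughly like this (illustrative sketch, not the exact downstream code):
;   extern "C" { fn __nvvm_thread_idx_x() -> u32; }
;   pub fn thread_idx_x() -> u32 { unsafe { __nvvm_thread_idx_x() } }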
source_filename = "libintrinsics"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
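; Special-register mapping: tid = threadIdx, ntid = blockDim, ctaid = blockIdx,
; nctaid = gridDim, warpsize = warpSize.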
; thread ----
define i32 @__nvvm_thread_idx_x() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
define i32 @__nvvm_thread_idx_y() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
define i32 @__nvvm_thread_idx_z() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
ret i32 %0
}
; block dimension ----
define i32 @__nvvm_block_dim_x() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
ret i32 %0
}
define i32 @__nvvm_block_dim_y() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
ret i32 %0
}
define i32 @__nvvm_block_dim_z() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
ret i32 %0
}
; block idx ----
define i32 @__nvvm_block_idx_x() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
define i32 @__nvvm_block_idx_y() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
define i32 @__nvvm_block_idx_z() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
ret i32 %0
}
; grid dimension ----
define i32 @__nvvm_grid_dim_x() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
ret i32 %0
}
define i32 @__nvvm_grid_dim_y() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
ret i32 %0
}
define i32 @__nvvm_grid_dim_z() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
ret i32 %0
}
; warp ----
define i32 @__nvvm_warp_size() #0 {
start:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
ret i32 %0
}
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
; other ----
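; barrier0 corresponds to PTX bar.sync 0 (CUDA __syncthreads); membar.cta, membar.gl and
; membar.sys are memory fences at block (CTA), device and system scope respectively.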
define void @__nvvm_block_barrier() #1 {
start:
call void @llvm.nvvm.barrier0()
ret void
}
declare void @llvm.nvvm.barrier0()
define void @__nvvm_grid_fence() #1 {
start:
call void @llvm.nvvm.membar.cta()
ret void
}
declare void @llvm.nvvm.membar.cta()
define void @__nvvm_device_fence() #1 {
start:
call void @llvm.nvvm.membar.gl()
ret void
}
declare void @llvm.nvvm.membar.gl()
define void @__nvvm_system_fence() #1 {
start:
call void @llvm.nvvm.membar.sys()
ret void
}
declare void @llvm.nvvm.membar.sys()
define void @__nvvm_trap() #1 {
start:
call void @llvm.trap()
unreachable
}
declare void @llvm.trap()
; math stuff -------------
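; The 8-bit overflow helpers widen their operands to i16, do the operation there, and
; detect overflow by checking whether the truncated i8 result extends back to the same
; i16 value. For example, for u8 addition: 200 + 100 = 300 (0x012C) in i16; trunc gives
; 0x2C = 44, and zext(44) != 300, so the overflow flag is set and the wrapped result is 44.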
define {i8, i1} @__nvvm_i8_addo(i8, i8) #0 {
start:
%2 = sext i8 %0 to i16
%3 = sext i8 %1 to i16
%4 = add i16 %2, %3
%5 = trunc i16 %4 to i8
%6 = sext i8 %5 to i16
%7 = icmp ne i16 %4, %6
%8 = insertvalue {i8, i1} undef, i8 %5, 0
%9 = insertvalue {i8, i1} %8, i1 %7, 1
ret {i8, i1} %9
}
define {i8, i1} @__nvvm_u8_addo(i8, i8) #0 {
start:
%2 = zext i8 %0 to i16
%3 = zext i8 %1 to i16
%4 = add i16 %2, %3
%5 = trunc i16 %4 to i8
%6 = zext i8 %5 to i16
%7 = icmp ne i16 %4, %6
%8 = insertvalue {i8, i1} undef, i8 %5, 0
%9 = insertvalue {i8, i1} %8, i1 %7, 1
ret {i8, i1} %9
}
define {i8, i1} @__nvvm_i8_subo(i8, i8) #0 {
start:
%2 = sext i8 %0 to i16
%3 = sext i8 %1 to i16
%4 = sub i16 %2, %3
%5 = trunc i16 %4 to i8
%6 = sext i8 %5 to i16
%7 = icmp ne i16 %4, %6
%8 = insertvalue {i8, i1} undef, i8 %5, 0
%9 = insertvalue {i8, i1} %8, i1 %7, 1
ret {i8, i1} %9
}
define {i8, i1} @__nvvm_u8_subo(i8, i8) #0 {
start:
%2 = zext i8 %0 to i16
%3 = zext i8 %1 to i16
%4 = sub i16 %2, %3
%5 = trunc i16 %4 to i8
%6 = zext i8 %5 to i16
%7 = icmp ne i16 %4, %6
%8 = insertvalue {i8, i1} undef, i8 %5, 0
%9 = insertvalue {i8, i1} %8, i1 %7, 1
ret {i8, i1} %9
}
define {i8, i1} @__nvvm_i8_mulo(i8, i8) #0 {
start:
%2 = sext i8 %0 to i16
%3 = sext i8 %1 to i16
%4 = mul i16 %2, %3
%5 = trunc i16 %4 to i8
%6 = sext i8 %5 to i16
%7 = icmp ne i16 %4, %6
%8 = insertvalue {i8, i1} undef, i8 %5, 0
%9 = insertvalue {i8, i1} %8, i1 %7, 1
ret {i8, i1} %9
}
define {i8, i1} @__nvvm_u8_mulo(i8, i8) #0 {
start:
%2 = zext i8 %0 to i16
%3 = zext i8 %1 to i16
%4 = mul i16 %2, %3
%5 = trunc i16 %4 to i8
%6 = zext i8 %5 to i16
%7 = icmp ne i16 %4, %6
%8 = insertvalue {i8, i1} undef, i8 %5, 0
%9 = insertvalue {i8, i1} %8, i1 %7, 1
ret {i8, i1} %9
}
; This is a bit weird: the codegen needs to use functions defined in Rust crates
; (compiler_builtins) as intrinsics, but it cannot call them by their real names directly,
; otherwise we get odd, incorrect behavior in the crate they are defined in. So we wrap
; them in functions that are opaque to the codegen, which is what these definitions do.
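;
; The 128-bit values are passed as <2 x i64> here; presumably the codegen represents i128
; as a pair of i64s since libnvvm does not handle native 128-bit integers. On the Rust
; side, the wrapped functions roughly correspond to compiler_builtins' overflowing ops,
; something like (sketch of the assumed shape, not the exact signature):
;   fn __rust_i128_addo(a: i128, b: i128) -> (i128, bool);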
define {<2 x i64>, i1} @__nvvm_i128_addo(<2 x i64>, <2 x i64>) #0 {
start:
%2 = call {<2 x i64>, i1} @__rust_i128_addo(<2 x i64> %0, <2 x i64> %1)
ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_i128_addo(<2 x i64>, <2 x i64>) #0
define {<2 x i64>, i1} @__nvvm_u128_addo(<2 x i64>, <2 x i64>) #0 {
start:
%2 = call {<2 x i64>, i1} @__rust_u128_addo(<2 x i64> %0, <2 x i64> %1)
ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_u128_addo(<2 x i64>, <2 x i64>) #0
define {<2 x i64>, i1} @__nvvm_i128_subo(<2 x i64>, <2 x i64>) #0 {
start:
%2 = call {<2 x i64>, i1} @__rust_i128_subo(<2 x i64> %0, <2 x i64> %1)
ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_i128_subo(<2 x i64>, <2 x i64>) #0
define {<2 x i64>, i1} @__nvvm_u128_subo(<2 x i64>, <2 x i64>) #0 {
start:
%2 = call {<2 x i64>, i1} @__rust_u128_subo(<2 x i64> %0, <2 x i64> %1)
ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_u128_subo(<2 x i64>, <2 x i64>) #0
define {<2 x i64>, i1} @__nvvm_i128_mulo(<2 x i64>, <2 x i64>) #0 {
start:
%2 = call {<2 x i64>, i1} @__rust_i128_mulo(<2 x i64> %0, <2 x i64> %1)
ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_i128_mulo(<2 x i64>, <2 x i64>) #0
define {<2 x i64>, i1} @__nvvm_u128_mulo(<2 x i64>, <2 x i64>) #0 {
start:
%2 = call {<2 x i64>, i1} @__rust_u128_mulo(<2 x i64> %0, <2 x i64> %1)
ret {<2 x i64>, i1} %2
}
declare {<2 x i64>, i1} @__rust_u128_mulo(<2 x i64>, <2 x i64>) #0
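; Attribute notes: #0 (alwaysinline speculatable) is used for the side-effect-free wrappers;
; #1 (alwaysinline only) is used for the barrier, fence and trap wrappers, which have side
; effects and therefore must not be marked speculatable.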
attributes #0 = { alwaysinline speculatable }
attributes #1 = { alwaysinline }