fn ptx_header(sm: u32) -> String {
let ptx_ver = if sm >= 100 {
"8.7"
} else if sm >= 90 {
"8.4"
} else if sm >= 80 {
"8.0"
} else {
"7.5"
};
format!(".version {ptx_ver}\n.target sm_{sm}\n.address_size 64\n\n")
}
pub fn f32_hex(v: f32) -> String {
format!("0F{:08X}", v.to_bits())
}
pub fn csr_spmv_ptx(sm: u32) -> String {
let hdr = ptx_header(sm);
format!(
r#"{hdr}.visible .entry csr_spmv(
.param .u64 p_row_ptr,
.param .u64 p_col_idx,
.param .u64 p_val,
.param .u64 p_x,
.param .u64 p_y,
.param .u32 n_rows
)
{{
.reg .u64 %rd<16>;
.reg .u32 %r<16>;
.reg .f32 %f<8>;
.reg .pred %p<4>;
// Load parameters
ld.param.u64 %rd0, [p_row_ptr];
ld.param.u64 %rd1, [p_col_idx];
ld.param.u64 %rd2, [p_val];
ld.param.u64 %rd3, [p_x];
ld.param.u64 %rd4, [p_y];
ld.param.u32 %r0, [n_rows];
// tid = blockDim.x * blockIdx.x + threadIdx.x
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.u32 %r4, %r1, %r2, %r3; // r4 = tid
// row = tid / 32, lane = tid % 32
shr.u32 %r5, %r4, 5; // r5 = row = tid >> 5
and.b32 %r6, %r4, 31; // r6 = lane = tid & 31
// if row >= n_rows, exit
setp.ge.u32 %p0, %r5, %r0;
@%p0 bra $DONE;
// row_start = row_ptr[row], row_end = row_ptr[row+1]
mul.wide.u32 %rd5, %r5, 4;
add.u64 %rd5, %rd0, %rd5;
ld.global.u32 %r7, [%rd5]; // r7 = row_start
add.u32 %r8, %r5, 1;
mul.wide.u32 %rd6, %r8, 4;
add.u64 %rd6, %rd0, %rd6;
ld.global.u32 %r8, [%rd6]; // r8 = row_end
// Warp iterates: e = row_start + lane, step = 32
add.u32 %r9, %r7, %r6; // r9 = e = row_start + lane
mov.f32 %f0, 0F00000000; // f0 = partial_sum = 0.0
$LOOP:
setp.ge.u32 %p1, %r9, %r8;
@%p1 bra $REDUCE;
// col = col_idx[e]
mul.wide.u32 %rd7, %r9, 4;
add.u64 %rd7, %rd1, %rd7;
ld.global.u32 %r10, [%rd7]; // r10 = col
// val = p_val[e]
mul.wide.u32 %rd8, %r9, 4;
add.u64 %rd8, %rd2, %rd8;
ld.global.f32 %f1, [%rd8]; // f1 = val
// x_col = p_x[col]
mul.wide.u32 %rd9, %r10, 4;
add.u64 %rd9, %rd3, %rd9;
ld.global.f32 %f2, [%rd9]; // f2 = x[col]
fma.rn.f32 %f0, %f1, %f2, %f0; // f0 += val * x[col]
add.u32 %r9, %r9, 32; // e += warp_size
bra $LOOP;
$REDUCE:
// Warp-level reduction using shfl_down (butterfly pattern across 16-8-4-2-1 lanes)
shfl.sync.down.b32 %f3, %f0, 16, 31, 0xFFFFFFFF;
add.f32 %f0, %f0, %f3;
shfl.sync.down.b32 %f3, %f0, 8, 31, 0xFFFFFFFF;
add.f32 %f0, %f0, %f3;
shfl.sync.down.b32 %f3, %f0, 4, 31, 0xFFFFFFFF;
add.f32 %f0, %f0, %f3;
shfl.sync.down.b32 %f3, %f0, 2, 31, 0xFFFFFFFF;
add.f32 %f0, %f0, %f3;
shfl.sync.down.b32 %f3, %f0, 1, 31, 0xFFFFFFFF;
add.f32 %f0, %f0, %f3;
// Only lane 0 writes result
setp.ne.u32 %p2, %r6, 0;
@%p2 bra $DONE;
mul.wide.u32 %rd10, %r5, 4;
add.u64 %rd10, %rd4, %rd10;
st.global.f32 [%rd10], %f0;
$DONE:
ret;
}}
"#
)
}
pub fn scatter_add_ptx(sm: u32) -> String {
let hdr = ptx_header(sm);
format!(
r#"{hdr}.visible .entry scatter_add(
.param .u64 p_idx,
.param .u64 p_src,
.param .u64 p_out,
.param .u32 n
)
{{
.reg .u64 %rd<8>;
.reg .u32 %r<8>;
.reg .f32 %f<2>;
.reg .pred %p0;
ld.param.u64 %rd0, [p_idx];
ld.param.u64 %rd1, [p_src];
ld.param.u64 %rd2, [p_out];
ld.param.u32 %r0, [n];
// tid = blockDim.x * blockIdx.x + threadIdx.x
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.u32 %r4, %r1, %r2, %r3;
setp.ge.u32 %p0, %r4, %r0;
@%p0 bra $DONE;
// dest_idx = idx[tid]
mul.wide.u32 %rd3, %r4, 4;
add.u64 %rd3, %rd0, %rd3;
ld.global.u32 %r5, [%rd3];
// val = src[tid]
mul.wide.u32 %rd4, %r4, 4;
add.u64 %rd4, %rd1, %rd4;
ld.global.f32 %f0, [%rd4];
// atom.add out[dest_idx] += val
mul.wide.u32 %rd5, %r5, 4;
add.u64 %rd5, %rd2, %rd5;
atom.global.add.f32 %f1, [%rd5], %f0;
$DONE:
ret;
}}
"#
)
}
pub fn gat_attention_ptx(sm: u32) -> String {
let hdr = ptx_header(sm);
let leaky_slope = f32_hex(0.2_f32);
format!(
r#"{hdr}.visible .entry gat_attention(
.param .u64 p_src_feat,
.param .u64 p_dst_feat,
.param .u64 p_a,
.param .u64 p_score,
.param .u32 feat_dim,
.param .u32 n_edges
)
{{
.reg .u64 %rd<12>;
.reg .u32 %r<12>;
.reg .f32 %f<8>;
.reg .pred %p<4>;
ld.param.u64 %rd0, [p_src_feat];
ld.param.u64 %rd1, [p_dst_feat];
ld.param.u64 %rd2, [p_a];
ld.param.u64 %rd3, [p_score];
ld.param.u32 %r0, [feat_dim];
ld.param.u32 %r1, [n_edges];
// tid
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %tid.x;
mad.lo.u32 %r5, %r2, %r3, %r4; // r5 = edge_id
setp.ge.u32 %p0, %r5, %r1;
@%p0 bra $DONE;
// base offset for this edge in feat arrays: edge_id * feat_dim * 4
mul.lo.u32 %r6, %r5, %r0; // r6 = edge_id * feat_dim
mul.wide.u32 %rd4, %r6, 4; // rd4 = byte offset
// dot product: sum_k src[edge*fd + k] * a[k] + dst[edge*fd + k] * a[fd + k]
mov.f32 %f0, 0F00000000; // accumulator = 0
mov.u32 %r7, 0; // k = 0
$LOOP:
setp.ge.u32 %p1, %r7, %r0;
@%p1 bra $POSTLOOP;
// byte offset for element k within this edge's feature slice
mul.wide.u32 %rd5, %r7, 4;
// src_feat[edge*fd + k]
add.u64 %rd6, %rd0, %rd4;
add.u64 %rd6, %rd6, %rd5;
ld.global.f32 %f1, [%rd6];
// a[k]
mul.wide.u32 %rd7, %r7, 4;
add.u64 %rd7, %rd2, %rd7;
ld.global.f32 %f2, [%rd7];
fma.rn.f32 %f0, %f1, %f2, %f0; // accum += src[k] * a[k]
// dst_feat[edge*fd + k]
add.u64 %rd8, %rd1, %rd4;
add.u64 %rd8, %rd8, %rd5;
ld.global.f32 %f3, [%rd8];
// a[fd + k]
add.u32 %r8, %r7, %r0; // fd + k
mul.wide.u32 %rd9, %r8, 4;
add.u64 %rd9, %rd2, %rd9;
ld.global.f32 %f4, [%rd9];
fma.rn.f32 %f0, %f3, %f4, %f0; // accum += dst[k] * a[fd+k]
add.u32 %r7, %r7, 1;
bra $LOOP;
$POSTLOOP:
// LeakyReLU: if x < 0 then 0.2 * x else x
mov.f32 %f5, {leaky_slope};
setp.lt.f32 %p2, %f0, 0F00000000;
mul.f32 %f6, %f0, %f5;
selp.f32 %f0, %f6, %f0, %p2;
// store score[edge_id]
mul.wide.u32 %rd10, %r5, 4;
add.u64 %rd10, %rd3, %rd10;
st.global.f32 [%rd10], %f0;
$DONE:
ret;
}}
"#
)
}
pub fn softmax_edge_ptx(sm: u32) -> String {
let hdr = ptx_header(sm);
format!(
r#"{hdr}.visible .entry softmax_edge(
.param .u64 p_score,
.param .u64 p_row_ptr,
.param .u64 p_alpha,
.param .u32 n_nodes
)
{{
.reg .u64 %rd<10>;
.reg .u32 %r<12>;
.reg .f32 %f<8>;
.reg .pred %p<4>;
ld.param.u64 %rd0, [p_score];
ld.param.u64 %rd1, [p_row_ptr];
ld.param.u64 %rd2, [p_alpha];
ld.param.u32 %r0, [n_nodes];
// tid = node_id
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.u32 %r4, %r1, %r2, %r3;
setp.ge.u32 %p0, %r4, %r0;
@%p0 bra $DONE;
// row_start = row_ptr[node_id]
mul.wide.u32 %rd3, %r4, 4;
add.u64 %rd3, %rd1, %rd3;
ld.global.u32 %r5, [%rd3]; // r5 = row_start
// row_end = row_ptr[node_id + 1]
add.u32 %r6, %r4, 1;
mul.wide.u32 %rd4, %r6, 4;
add.u64 %rd4, %rd1, %rd4;
ld.global.u32 %r6, [%rd4]; // r6 = row_end
// If no outgoing edges, skip
setp.ge.u32 %p1, %r5, %r6;
@%p1 bra $DONE;
// Pass 1: find max score in this node's out-edges
mov.f32 %f0, 0FFF800000; // -inf
mov.u32 %r7, %r5; // e = row_start
$MAXLOOP:
setp.ge.u32 %p2, %r7, %r6;
@%p2 bra $EXPLOOP_INIT;
mul.wide.u32 %rd5, %r7, 4;
add.u64 %rd5, %rd0, %rd5;
ld.global.f32 %f1, [%rd5];
max.f32 %f0, %f0, %f1;
add.u32 %r7, %r7, 1;
bra $MAXLOOP;
$EXPLOOP_INIT:
// Pass 2: compute sum of exp(score - max)
mov.f32 %f2, 0F00000000; // sum = 0
mov.u32 %r7, %r5;
$SUMLOOP:
setp.ge.u32 %p2, %r7, %r6;
@%p2 bra $NORMLOOP_INIT;
mul.wide.u32 %rd5, %r7, 4;
add.u64 %rd5, %rd0, %rd5;
ld.global.f32 %f3, [%rd5];
sub.f32 %f3, %f3, %f0; // score - max
ex2.approx.f32 %f3, %f3; // 2^x approximation for exp
add.f32 %f2, %f2, %f3;
add.u32 %r7, %r7, 1;
bra $SUMLOOP;
$NORMLOOP_INIT:
// Pass 3: normalise and store
mov.u32 %r7, %r5;
$NORMLOOP:
setp.ge.u32 %p2, %r7, %r6;
@%p2 bra $DONE;
mul.wide.u32 %rd5, %r7, 4;
add.u64 %rd5, %rd0, %rd5;
ld.global.f32 %f4, [%rd5];
sub.f32 %f4, %f4, %f0;
ex2.approx.f32 %f4, %f4;
div.approx.f32 %f4, %f4, %f2;
mul.wide.u32 %rd6, %r7, 4;
add.u64 %rd6, %rd2, %rd6;
st.global.f32 [%rd6], %f4;
add.u32 %r7, %r7, 1;
bra $NORMLOOP;
$DONE:
ret;
}}
"#
)
}
pub fn aggregate_mean_ptx(sm: u32) -> String {
let hdr = ptx_header(sm);
format!(
r#"{hdr}.visible .entry aggregate_mean(
.param .u64 p_feat,
.param .u64 p_row_ptr,
.param .u64 p_col_idx,
.param .u64 p_out,
.param .u32 feat_dim,
.param .u32 n_nodes
)
{{
.reg .u64 %rd<12>;
.reg .u32 %r<14>;
.reg .f32 %f<4>;
.reg .pred %p<4>;
ld.param.u64 %rd0, [p_feat];
ld.param.u64 %rd1, [p_row_ptr];
ld.param.u64 %rd2, [p_col_idx];
ld.param.u64 %rd3, [p_out];
ld.param.u32 %r0, [feat_dim];
ld.param.u32 %r1, [n_nodes];
// tid = blockDim.x * blockIdx.x + threadIdx.x
// Each thread handles one (node, feature_dim) pair
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %tid.x;
mad.lo.u32 %r5, %r2, %r3, %r4; // tid
// node_id = tid / feat_dim, k = tid % feat_dim
div.u32 %r6, %r5, %r0; // node_id
rem.u32 %r7, %r5, %r0; // k
// guard: node_id < n_nodes
mul.lo.u32 %r8, %r1, %r0; // n_nodes * feat_dim
setp.ge.u32 %p0, %r5, %r8;
@%p0 bra $DONE;
// row_start = row_ptr[node_id]
mul.wide.u32 %rd4, %r6, 4;
add.u64 %rd4, %rd1, %rd4;
ld.global.u32 %r9, [%rd4];
// row_end = row_ptr[node_id + 1]
add.u32 %r10, %r6, 1;
mul.wide.u32 %rd5, %r10, 4;
add.u64 %rd5, %rd1, %rd5;
ld.global.u32 %r10, [%rd5];
// degree = row_end - row_start
sub.u32 %r11, %r10, %r9;
// If isolated node, write 0 and exit
mov.f32 %f0, 0F00000000;
setp.eq.u32 %p1, %r11, 0;
@%p1 bra $WRITE;
// Accumulate sum over neighbours
mov.u32 %r12, %r9; // e = row_start
$LOOP:
setp.ge.u32 %p2, %r12, %r10;
@%p2 bra $NORMALIZE;
// neighbour = col_idx[e]
mul.wide.u32 %rd6, %r12, 4;
add.u64 %rd6, %rd2, %rd6;
ld.global.u32 %r13, [%rd6];
// feat[neighbour*fd + k]
mad.lo.u32 %r13, %r13, %r0, %r7;
mul.wide.u32 %rd7, %r13, 4;
add.u64 %rd7, %rd0, %rd7;
ld.global.f32 %f1, [%rd7];
add.f32 %f0, %f0, %f1;
add.u32 %r12, %r12, 1;
bra $LOOP;
$NORMALIZE:
// out[node*fd + k] = sum / degree
cvt.rn.f32.u32 %f2, %r11;
div.approx.f32 %f0, %f0, %f2;
$WRITE:
mul.wide.u32 %rd8, %r5, 4;
add.u64 %rd8, %rd3, %rd8;
st.global.f32 [%rd8], %f0;
$DONE:
ret;
}}
"#
)
}
pub fn gin_combine_ptx(sm: u32) -> String {
let hdr = ptx_header(sm);
format!(
r#"{hdr}.visible .entry gin_combine(
.param .u64 p_self,
.param .u64 p_aggr,
.param .u64 p_out,
.param .f32 eps_f32,
.param .u32 n,
.param .u32 feat_dim
)
{{
.reg .u64 %rd<8>;
.reg .u32 %r<10>;
.reg .f32 %f<6>;
.reg .pred %p0;
ld.param.u64 %rd0, [p_self];
ld.param.u64 %rd1, [p_aggr];
ld.param.u64 %rd2, [p_out];
ld.param.f32 %f0, [eps_f32];
ld.param.u32 %r0, [n];
ld.param.u32 %r1, [feat_dim];
// tid
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %tid.x;
mad.lo.u32 %r5, %r2, %r3, %r4;
mul.lo.u32 %r6, %r0, %r1; // n * feat_dim
setp.ge.u32 %p0, %r5, %r6;
@%p0 bra $DONE;
mul.wide.u32 %rd3, %r5, 4;
add.u64 %rd4, %rd0, %rd3;
ld.global.f32 %f1, [%rd4]; // self_feat[tid]
add.u64 %rd5, %rd1, %rd3;
ld.global.f32 %f2, [%rd5]; // aggr_feat[tid]
// out = (1 + eps) * self + aggr
mov.f32 %f3, 0F3F800000; // 1.0
add.f32 %f4, %f3, %f0; // 1 + eps
fma.rn.f32 %f5, %f4, %f1, %f2; // (1+eps)*self + aggr
add.u64 %rd6, %rd2, %rd3;
st.global.f32 [%rd6], %f5;
$DONE:
ret;
}}
"#
)
}
pub fn topk_score_ptx(sm: u32) -> String {
let hdr = ptx_header(sm);
let log2e = f32_hex(std::f32::consts::LOG2_E);
format!(
r#"{hdr}.visible .entry topk_score(
.param .u64 p_feat,
.param .u64 p_proj,
.param .u64 p_score,
.param .u32 feat_dim,
.param .u32 n_nodes
)
{{
.reg .u64 %rd<8>;
.reg .u32 %r<10>;
.reg .f32 %f<12>;
.reg .pred %p<4>;
ld.param.u64 %rd0, [p_feat];
ld.param.u64 %rd1, [p_proj];
ld.param.u64 %rd2, [p_score];
ld.param.u32 %r0, [feat_dim];
ld.param.u32 %r1, [n_nodes];
// tid = node_id
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %tid.x;
mad.lo.u32 %r5, %r2, %r3, %r4;
setp.ge.u32 %p0, %r5, %r1;
@%p0 bra $DONE;
// base offset = node_id * feat_dim
mul.lo.u32 %r6, %r5, %r0;
// Compute dot product and norm_sq simultaneously
mov.f32 %f0, 0F00000000; // dot = 0
mov.f32 %f1, 0F00000000; // norm_sq = 0
mov.u32 %r7, 0; // k = 0
$LOOP:
setp.ge.u32 %p1, %r7, %r0;
@%p1 bra $POSTLOOP;
add.u32 %r8, %r6, %r7;
mul.wide.u32 %rd3, %r8, 4;
add.u64 %rd3, %rd0, %rd3;
ld.global.f32 %f2, [%rd3]; // feat[node*fd + k]
mul.wide.u32 %rd4, %r7, 4;
add.u64 %rd4, %rd1, %rd4;
ld.global.f32 %f3, [%rd4]; // proj[k]
fma.rn.f32 %f0, %f2, %f3, %f0; // dot += feat * proj
fma.rn.f32 %f1, %f3, %f3, %f1; // norm_sq += proj^2
add.u32 %r7, %r7, 1;
bra $LOOP;
$POSTLOOP:
// norm = sqrt(norm_sq); safe divide by norm
sqrt.approx.f32 %f4, %f1;
// avoid div-by-zero: add tiny epsilon
mov.f32 %f5, 0F00800000; // 1e-38 (min normal f32)
add.f32 %f4, %f4, %f5;
div.approx.f32 %f6, %f0, %f4; // x = dot / norm
// tanh(x) = 2/(1 + exp(-2x)) - 1
// exp(-2x) using ex2: exp(-2x) = 2^(-2x * log2e)
mov.f32 %f7, {log2e};
mul.f32 %f8, %f6, %f7; // x * log2e
neg.f32 %f9, %f8;
add.f32 %f9, %f9, %f9; // -2 * x * log2e
ex2.approx.f32 %f9, %f9; // exp(-2x)
mov.f32 %f10, 0F3F800000; // 1.0
add.f32 %f9, %f10, %f9; // 1 + exp(-2x)
mov.f32 %f11, 0F40000000; // 2.0
div.approx.f32 %f9, %f11, %f9; // 2/(1+exp(-2x))
sub.f32 %f9, %f9, %f10; // tanh(x)
mul.wide.u32 %rd5, %r5, 4;
add.u64 %rd5, %rd2, %rd5;
st.global.f32 [%rd5], %f9;
$DONE:
ret;
}}
"#
)
}
#[cfg(test)]
mod tests {
use super::*;
const SM_VERSIONS: &[u32] = &[75, 80, 86, 90, 100, 120];
#[test]
fn ptx_header_sm80_contains_target() {
let h = ptx_header(80);
assert!(h.contains(".target sm_80"));
assert!(h.contains(".version 8.0"));
assert!(h.contains(".address_size 64"));
}
#[test]
fn ptx_header_sm90_contains_target() {
let h = ptx_header(90);
assert!(h.contains(".target sm_90"));
assert!(h.contains(".version 8.4"));
}
#[test]
fn ptx_header_sm120_contains_target() {
let h = ptx_header(120);
assert!(h.contains(".target sm_120"));
assert!(h.contains(".version 8.7"));
}
#[test]
fn f32_hex_one() {
assert_eq!(f32_hex(1.0_f32), "0F3F800000");
}
#[test]
fn f32_hex_zero() {
assert_eq!(f32_hex(0.0_f32), "0F00000000");
}
#[test]
fn f32_hex_negative() {
assert_eq!(f32_hex(-1.0_f32), "0FBF800000");
}
#[test]
fn e2e_ptx_kernels_all_sm_versions() {
for &sm in SM_VERSIONS {
let ptx = csr_spmv_ptx(sm);
assert!(ptx.contains("csr_spmv"), "spmv missing entry for sm={sm}");
assert!(ptx.contains(&format!("sm_{sm}")));
let ptx = scatter_add_ptx(sm);
assert!(
ptx.contains("scatter_add"),
"scatter_add missing entry for sm={sm}"
);
let ptx = gat_attention_ptx(sm);
assert!(
ptx.contains("gat_attention"),
"gat_attention missing entry for sm={sm}"
);
let ptx = softmax_edge_ptx(sm);
assert!(
ptx.contains("softmax_edge"),
"softmax_edge missing entry for sm={sm}"
);
let ptx = aggregate_mean_ptx(sm);
assert!(
ptx.contains("aggregate_mean"),
"aggregate_mean missing entry for sm={sm}"
);
let ptx = gin_combine_ptx(sm);
assert!(
ptx.contains("gin_combine"),
"gin_combine missing entry for sm={sm}"
);
let ptx = topk_score_ptx(sm);
assert!(
ptx.contains("topk_score"),
"topk_score missing entry for sm={sm}"
);
}
}
#[test]
fn csr_spmv_ptx_has_warp_reduction() {
let ptx = csr_spmv_ptx(80);
assert!(ptx.contains("shfl.sync.down.b32"));
}
#[test]
fn scatter_add_ptx_uses_atomic() {
let ptx = scatter_add_ptx(80);
assert!(ptx.contains("atom.global.add.f32"));
}
#[test]
fn gat_attention_ptx_has_leaky_relu() {
let ptx = gat_attention_ptx(80);
assert!(ptx.contains("setp.lt.f32"));
assert!(ptx.contains("selp.f32"));
}
#[test]
fn softmax_edge_ptx_has_exp_and_div() {
let ptx = softmax_edge_ptx(80);
assert!(ptx.contains("ex2.approx.f32"));
assert!(ptx.contains("div.approx.f32"));
}
#[test]
fn gin_combine_ptx_uses_fma() {
let ptx = gin_combine_ptx(80);
assert!(ptx.contains("fma.rn.f32"));
assert!(ptx.contains("3F800000"));
}
#[test]
fn topk_score_ptx_uses_tanh_approx() {
let ptx = topk_score_ptx(80);
assert!(ptx.contains("ex2.approx.f32"));
assert!(ptx.contains("sqrt.approx.f32"));
}
}