ptx-90-parser 0.4.4

Parse NVIDIA PTX 9.0 assembly into a structured AST and explore modules via a CLI.
Documentation
mod util;

use ptx_parser::run_with_large_stack;
use ptx_parser::{PtxUnparser, r#type::Instruction};
use util::{parse_result, tokenize_only};

const DOC_EXAMPLE_INSTRUCTIONS: &[&str] = &[
    "abs.s32  r0,a;",
    "activemask.b32  %r1;",
    "@p  add.u32     x,y,z;",
    "add.sat.s32 c,c,1;",
    "add.u16x2   u,v,w;",
    "@p  addc.cc.u32  x3,y3,z3;",
    "@p  addc.u32     x4,y4,z4;",
    "and.b32  x,q,r;",
    "and.b32  sign,fpvalue,0x80000000;",
    "applypriority.global.L2::evict_normal [ptr], 128;",
    "atom.global.add.s32  d,[a],1;",
    "@p  atom.global.cas.b32  d,[p],my_val,my_new_val;",
    "atom.add.noftz.f16x2 d, [a], b;",
    "atom.add.noftz.f16   hd, [ha], hb;",
    "atom.global.cas.b16  hd, [ha], hb, hc;",
    "atom.add.noftz.bf16   hd, [a], hb;",
    "atom.add.noftz.bf16x2 bd, [b], bb;",
    "createpolicy.fractional.L2::evict_last.b64 cache_policy, 0.25;",
    "atom.global.add.L2::cache_hint.s32  d, [a], 1, cache_policy;",
    "bfi.b32  d,a,b,start,len;",
    "bfind.u32  d, a;",
    "bmsk.clamp.b32  rd, ra, rb;",
    "brev.b32  d, a;",
    "brkpt;",
    "@p  brkpt;",
    "@p brx.idx %r0, ts;",
    "@p    ld.global.u32  %r0, [jmptbl+4];",
    "@p    ld.global.u32  %r0, [jmptbl+8];",
    "call  (retval), %r0, (x, y), jmptbl;",
    "@p    mov.u32  %r0, foo;",
    "@q    mov.u32  %r0, baz;",
    "call  (retval), %r0, (x, y), Ftgt;",
    "call  %fptr, (x, y), Fproto;",
    "mov.b32 xctaid, %ctaid.x;",
    "barrier.cluster.arrive.relaxed;",
    "barrier.cluster.wait.acquire;",
    "mov.u32  %r0, %tid.x;",
    "@!p0 bra asyncWork;",
    "mov.u32  %r0, %cluster_ctaid.x;",
    "@p0 clusterlaunchcontrol.try_cancel.async.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], [mbar];",
    "@!complete bra waitLoop;",
    "ld.shared.b128 handle, [addr];",
    "clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 p, handle;",
    "@p clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 {xctaid, _, _, _},  handle;",
    "fence.proxy.async.shared::cta;",
    "bra processCluster;",
    "clz.b32  d, a;",
    "cnot.b32 d,a;",
    "copysign.f32  x, y, z;",
    "copysign.f64  A, B, C;",
    "cos.approx.ftz.f32  ca, a;",
    "cp.async.ca.shared.global  [shrd],    [gbl + 4], 4;",
    "cp.async.cg.shared.global  [%r2],     [%r3],     16;",
    "cp.async.cg.shared.global.L2::64B   [%r2],      [%r3],     16;",
    "cp.async.cg.shared.global.L2::128B  [%r0 + 16], [%r1],     16;",
    "cp.async.cg.shared.global.L2::256B  [%r2 + 32], [%r3],     16;",
    "createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 cache_policy, 0.25;",
    "cp.async.ca.shared.global.L2::cache_hint [%r2], [%r1], 4, cache_policy;",
    "cp.async.ca.shared.global                   [shrd], [gbl], 4, p;",
    "cp.async.cg.shared.global.L2::cache_hint   [%r0], [%r2], 16, q, cache_policy;",
    "createpolicy.fractional.L2::evict_last.b64                      policy, 1.0;",
    "createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64  policy, 0.5;",
    "createpolicy.range.L2::evict_last.L2::evict_first.b64 policy, [ptr], 0x100000, 0x200000;",
    "createpolicy.cvt.L2.b64 policy, access_prop;",
    "cvt.f32.s32 f,i;",
    "cvta.const.u32   ptr,cvar;",
    "cvta.local.u32   ptr,lptr;",
    "cvta.to.global.u32  p,gptr;",
    "cvta.param.u64   ptr,pvar;",
    "discard.global.L2 [ptr], 128;",
    "ld.weak.u32 r0, [ptr];",
    "ld.weak.u32 r1, [ptr];",
    "div.s32  b,n,i;",
    "dp2a.lo.u32.u32           d0, a0, b0, c0;",
    "dp2a.hi.u32.s32           d1, a1, b1, c1;",
    "dp4a.u32.u32           d0, a0, b0, c0;",
    "dp4a.u32.s32           d1, a1, b1, c1;",
    "elect.sync    %r0|%p0, 0xffffffff;",
    "ex2.approx.ftz.f32  xa, a;",
    "exit;",
    "@p  exit;",
    "membar.gl;",
    "membar.cta;",
    "membar.sys;",
    "fence.sc.cta;",
    "fence.sc.cluster;",
    "fence.proxy.alias;",
    "membar.proxy.alias;",
    "fence.mbarrier_init.release.cluster;",
    "fence.proxy.async;",
    "fence.proxy.async.shared::cluster;",
    "fence.proxy.async.global;",
    "tensormap.replace.tile.global_address.global.b1024.b64   [gbl], new_addr;",
    "fence.proxy.tensormap::generic.release.gpu;",
    "cvta.global.u64  tmap, gbl;",
    "fence.proxy.tensormap::generic.acquire.gpu [tmap], 128;",
    "fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster;",
    "fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster;",
    "fence.acquire.sync_restrict::shared::cluster.cluster;",
    "fence.release.sync_restrict::shared::cta.cluster;",
    "mbarrier.arrive.relaxed.cluster.shared::cluster.b64 state, [bar];",
    "fma.rn.ftz.f32  w,x,y,z;",
    "@p  fma.rn.f64      d,a,b,c;",
    "fma.rp.ftz.f32x2 p,q,r,s;",
    "getctarank.shared::cluster.u32 d1, addr;",
    "getctarank.shared::cluster.u64 d2, sh + 4;",
    "getctarank.u64                 d3, src;",
    "griddepcontrol.launch_dependents;",
    "griddepcontrol.wait;",
    "isspacep.const           iscnst, cptr;",
    "isspacep.global          isglbl, gptr;",
    "isspacep.local           islcl,  lptr;",
    "isspacep.shared          isshrd, sptr;",
    "istypep.texref istex, tptr;",
    "istypep.samplerref issampler, sptr;",
    "istypep.surfref issurface, surfptr;",
    "ld.global.f32    d,[a];",
    "ld.shared.v4.b32 Q,[p];",
    "ld.const.s32     d,[p+4];",
    "ld.global.f32    d,[ugbl].unified;",
    "ld.b32           %r0, [%r1].unified;",
    "ld.global.L1::evict_last.u32  d, [p];",
    "createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 cache_policy, 1;",
    "ld.global.L2::cache_hint.b64  x, [p], cache_policy;",
    "ld.param::entry.b32 %rp1, [kparam1];",
    "ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {d0, d1}, [addr];",
    "ldmatrix.sync.aligned.m8n8.x4.b16 {d0, d1, d2, d3}, [addr];",
    "ldmatrix.sync.aligned.m16n16.x1.trans.shared.b8 {d0, d1}, [addr];",
    "ldu.global.f32    d,[a];",
    "ldu.global.b32    d,[p+4];",
    "ldu.global.v4.f32 Q,[p];",
    "ldu.global.b128   d,[a];",
    "lg2.approx.ftz.f32  la, a;",
    "lop3.b32       d, a, b, c, 0x40;",
    "lop3.or.b32  d|p, a, b, c, 0x3f, q;",
    "lop3.and.b32 _|p, a, b, c, 0x3f, q;",
    "@p  mad.lo.s32 d,a,b,c;",
    "mad.lo.s32 r,p,q,r;",
    "mapa.shared::cluster.u64 d1, %reg1, cta;",
    "mapa.shared::cluster.u32 d2, sh, 3;",
    "mapa.u64                 d3, %reg2, cta;",
    "match.any.sync.b32    d, a, 0xffffffff;",
    "match.all.sync.b64    d|p, a, mask;",
    "max.u32  d,a,b;",
    "max.s32  q,q,0;",
    "max.relu.s16x2 t,t,u;",
    "cvta.shared.u64          addr, shMem2;",
    "mbarrier.init.b64        [addr],   %r1;",
    "bar.cta.sync             0;",
    "bar.sync                 0;",
    "min.s32  r0,a,b;",
    "@p  min.u16  h,i,j;",
    "min.relu.s16x2 u,v,w;",
    "mov.f32  d,a;",
    "mov.u16  u,v;",
    "mov.f32  k,0.1;",
    "movmatrix.sync.aligned.m8n8.trans.b16 d, a;",
    "multimem.ld_reduce.and.b32                    val1_b32, [addr1];",
    "multimem.ld_reduce.acquire.gpu.global.add.u32 val2_u32, [addr2];",
    "multimem.st.relaxed.gpu.b32                [addr3], val3_b32;",
    "multimem.st.release.cta.global.u32         [addr4], val4_u32;",
    "nanosleep.u32 r;",
    "nanosleep.u32 42;",
    "@p nanosleep.u32 r;",
    "neg.s32  r0,a;",
    "not.b32  mask,mask;",
    "not.pred  p,q;",
    "or.pred  p,q,r;",
    "pmevent      1;",
    "@p  pmevent      7;",
    "@q  pmevent.mask 0xff;",
    "popc.b32  d, a;",
    "prefetch.global.L1             [ptr];",
    "prefetch.global.L2::evict_last [ptr];",
    "prefetchu.L1  [addr];",
    "prefetch.const.tensormap       [ptr];",
    "prmt.b32      r1, r2, r3, r4;",
    "prmt.b32.f4e  r1, r2, r3, r4;",
    "rcp.approx.ftz.f32  ri,r;",
    "rcp.rn.ftz.f32      xi,x;",
    "rcp.rn.f64          xi,x;",
    "red.add.noftz.f16x2 [a], b;",
    "red.add.noftz.bf16   [a], hb;",
    "red.add.noftz.bf16x2 [b], bb;",
    "redux.sync.add.s32 dst, src, 0xff;",
    "redux.sync.xor.b32 dst, src, mask;",
    "redux.sync.min.abs.NaN.f32 dst, src, mask;",
    "rem.s32  x,x,8;",
    "ret;",
    "@p  ret;",
    "rsqrt.approx.ftz.f32  isr, x;",
    "rsqrt.approx.f64      ISR, X;",
    "sad.s32  d,a,b,c;",
    "selp.s32  r0,r,g,p;",
    "@q  selp.f32  f0,t,x,xp;",
    "@p  set.lt.and.f32.s32  d,a,b,r;",
    "set.eq.u32.u32      d,i,n;",
    "setmaxnreg.dec.sync.aligned.u32 64;",
    "setmaxnreg.inc.sync.aligned.u32 192;",
    "setp.lt.and.s32  p|q,a,b,r;",
    "@q  setp.eq.u32      p,i,n;",
    "shfl.up.b32  Ry|p, Rx, 0x1,  0x0;",
    "@p  add.f32      Rx, Ry, Rx;",
    "shfl.up.b32  Ry|p, Rx, 0x2,  0x0;",
    "shfl.up.b32  Ry|p, Rx, 0x4,  0x0;",
    "shfl.up.b32  Ry|p, Rx, 0x8,  0x0;",
    "shfl.up.b32  Ry|p, Rx, 0x10, 0x0;",
    "shfl.down.b32  Ry|p, Rx, 0x1,  0x1f;",
    "@p  add.f32        Rx, Ry, Rx;",
    "shfl.down.b32  Ry|p, Rx, 0x2,  0x1f;",
    "shfl.down.b32  Ry|p, Rx, 0x4,  0x1f;",
    "shfl.down.b32  Ry|p, Rx, 0x8,  0x1f;",
    "shfl.down.b32  Ry|p, Rx, 0x10, 0x1f;",
    "add.f32        Rx, Ry, Rx;",
    "shfl.bfly.b32  Ry, Rx, 0x8,  0x1f;",
    "shfl.bfly.b32  Ry, Rx, 0x4,  0x1f;",
    "shfl.bfly.b32  Ry, Rx, 0x2,  0x1f;",
    "shfl.bfly.b32  Ry, Rx, 0x1,  0x1f;",
    "sin.approx.ftz.f32  sa, a;",
    "slct.u32.s32  x, y, z, val;",
    "slct.ftz.u64.f32  A, B, C, fval;",
    "sqrt.approx.ftz.f32  r,x;",
    "sqrt.rn.ftz.f32      r,x;",
    "sqrt.rn.f64          r,x;",
    "st.global.f32    [a],b;",
    "st.local.b32     [q+4],a;",
    "st.global.v4.s32 [p],Q;",
    "st.global.L1::no_allocate.f32 [p], a;",
    "st.global.L2::cache_hint.b32  [a], b, cache_policy;",
    "stacksave.u32 ra;",
    "stackrestore.u32 ra;",
    "stacksave.u32 rd;",
    "stacksave.u64 rd1;",
    "stmatrix.sync.aligned.m8n8.x1.shared.b16 [addr], {r};",
    "stmatrix.sync.aligned.m8n8.x4.b16 [addr], {r0, r1, r2, r3};",
    "stmatrix.sync.aligned.m16n8.x1.trans.shared.b8 [addr], {r};",
    "stmatrix.sync.aligned.m16n8.x4.b8 [addr], {r0, r1, r2, r3};",
    "sub.s32 c,a,b;",
    "@p  subc.cc.u32  x3,y3,z3;",
    "@p  subc.u32     x4,y4,z4;",
    "suld.b.1d.v4.b32.trap  {s1,s2,s3,s4}, [surf_B, {x}];",
    "suld.b.3d.v2.b64.trap  {r1,r2}, [surf_A, {x,y,z,w}];",
    "suld.b.a1d.v2.b32      {r0,r1}, [surf_C, {idx,x}];",
    "suq.width.b32       %r1, [surf_A];",
    "sured.b.add.2d.u32.trap  [surf_A, {x,y}], r1;",
    "sured.p.min.1d.b32.trap  [surf_B, {x}], r1;",
    "sured.b.max.1d.u64.trap  [surf_C, {x}], r1;",
    "sured.p.min.1d.b64.trap  [surf_D, {x}], r1;",
    "sust.p.1d.v4.b32.trap  [surf_B, {x}], {f1,f2,f3,f4};",
    "sust.b.3d.v2.b64.trap  [surf_A, {x,y,z,w}], {r1,r2};",
    "sust.b.a1d.v2.b64      [surf_C, {idx,x}], {r1,r2};",
    "szext.clamp.s32 rd, ra, rb;",
    "tanh.approx.f32 ta, a;",
    "tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [sMemAddr1], 32;",
    "ld.shared.b32 taddr, [sMemAddr1];",
    "tcgen05.dealloc.cta_group::1.sync.aligned.b32  taddr, 32;",
    "tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;",
    "tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [sMemAddr2], 32;",
    "ld.shared.b32 taddr, [sMemAddr2];",
    "tcgen05.dealloc.cta_group::2.sync.aligned.b32  taddr, 32;",
    "tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;",
    "tensormap.replace.tile.global_address.shared::cta.b1024.b64   [sMem], new_val;",
    "testp.notanumber.f32  isnan, f0;",
    "testp.infinite.f64    p, X;",
    "tex.3d.v4.s32.s32  {r1,r2,r3,r4}, [tex_a,{f1,f2,f3,f4}];",
    "tex.1d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,smpl_x,{f1}];",
    "tex.a1d.v4.s32.s32 {r1,r2,r3,r4}, [tex_a,smpl_x,{idx,s1}];",
    "tex.a2d.v4.s32.f32 {r1,r2,r3,r4}, [tex_a,{idx,f1,f2,f3}];",
    "tex.acube.v4.f32.f32 {r0,r1,r2,r3}, [tex_cuarray,{idx,f1,f2,f3}];",
    "tex.2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ms,{sample,r6,r7,r8}];",
    "tex.2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ms, smpl_x,{sample,r6,r7,r8}];",
    "tex.a2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ams,{idx,sample,r6,r7}];",
    "tex.1d.v4.f16.f32  {h1,h2,h3,h4}, [tex_a,smpl_x,{f1}];",
    "tex.1d.v2.f16x2.f32  {h1,h2}, [tex_a,smpl_x,{f1}];",
    "tex.1d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a, {f1}], {r5};",
    "tex.a2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{idx,f1,f2}], {f5,f6};",
    "tex.level.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}], flvl, {r7, r8};",
    "tex.1d.v4.f32.f32  {f1,f2,f3,f4}, [tex_a, {f1}], f0;",
    "tex.a2d.v4.s32.f32  {f0,f1,f2,f3}, [tex_a,{idx,f4,f5}], {r5,r6}, f6;",
    "tex.3d.v4.s32.s32 {r1,r2,r3,r4}|p, [tex_a,{f1,f2,f3,f4}];",
    "tld4.r.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}];",
    "tld4.r.2d.v4.u32.f32  {u1,u2,u3,u4}, [tex_a,smpl_x,{f1,f2}];",
    "tld4.r.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}], {r5, r6};",
    "tld4.r.2d.v4.f32.f32  {f1,f2,f3,f4}, [tex_a,{f5,f6}], f7;",
    "tld4.r.2d.v4.f32.f32 {f1,f2,f3,f4}|p, [tex_a,{f5,f6}], f7;",
    "trap;",
    "@p  trap;",
    "txq.width.b32       %r1, [tex_A];",
    "txq.level.width.b32 %r1, [tex_A], %r_lod;",
    "vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;",
    "vsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;",
    "vabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;",
    "vmin.s32.s32.s32.sat.add  r1, r2, r3, c;",
    "vadd2.s32.s32.u32.sat  r1, r2, r3, r1;",
    "vsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;",
    "vmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;",
    "vadd4.s32.s32.u32.sat  r1, r2, r3, r1;",
    "vsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;",
    "vmin4.s32.u32.u32.add  r1.b10, r2.b0000, r3.b2222, r1;",
    "vmad.s32.s32.u32.sat    r0, r1, r2, -r3;",
    "vmad.u32.u32.u32.shr15  r0, r1.h0, r2.h0, r3;",
    "vote.all.pred    p,q;",
    "vote.uni.pred    p,q;",
    "vset.s32.u32.lt    r1, r2, r3;",
    "vset.u32.u32.ne    r1, r2, r3.h1;",
    "vset2.s32.u32.lt      r1, r2, r3, r0;",
    "vset2.u32.u32.ne.add  r1, r2, r3, r0;",
    "vset4.s32.u32.lt      r1, r2, r3, r0;",
    "vshl.s32.u32.u32.clamp  r1, r2, r3;",
    "vshr.u32.u32.u32.wrap   r1, r2, r3.h1;",
    "wmma.load.b.sync.aligned.row.m16n16k16.f16 {x0,x1,x2,x3,x4,x5,x,x7}, [ptr];",
    "mul.f32 x0, x0, 0.1;",
    "mul.f32 x7, x7, 0.1;",
    "wmma.load.a.sync.aligned.row.m32n8k16.u8 {x0,x1,x2,x3}, [ptr];",
    "wmma.load.a.sync.aligned.row.m8n8k32.s4 {x0}, [ptr];",
    "xor.b32  d,q,r;",
    "xor.b16  d,x,0x0001;",
];

#[test]
fn docs_examples_roundtrip() {
    run_with_large_stack(|| inner_docs_examples_roundtrip());
}

fn inner_docs_examples_roundtrip() {
    let mut parse_failures = Vec::new();
    let mut mismatches = Vec::new();
    let mut matched = 0usize;
    for example in DOC_EXAMPLE_INSTRUCTIONS {
        match parse_result::<Instruction>(example) {
            Ok(parsed) => {
                let original = tokenize_only(example);
                let unparsed = parsed.to_tokens();
                if util::tokens_equivalent(&unparsed, &original) {
                    matched += 1;
                } else {
                    mismatches.push(*example);
                }
            }
            Err(_) => parse_failures.push(*example),
        }
    }
    eprintln!(
        "Doc examples roundtrip summary: {} matched, {} parse failures, {} mismatches",
        matched,
        parse_failures.len(),
        mismatches.len()
    );
    for example in &parse_failures {
        eprintln!("  [parse] {}", example);
    }
    for example in &mismatches {
        eprintln!("  [mismatch] {}", example);
    }
    assert!(
        mismatches.len() == 0,
        "no documentation examples round-tripped"
    );
    assert!(
        parse_failures.is_empty(),
        "no documentation examples failed to parse"
    );
}