1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
//! Bitwise and select operations for KernelBuilder.
//!
//! Provides shift, AND, OR, XOR, and predicate-based select (selp) operations.
use super::super::instructions::{Operand, PtxInstruction, PtxOp};
use super::super::registers::VirtualReg;
use super::super::types::PtxType;
use super::control::PtxControl;
use super::KernelBuilder;
impl<'a> KernelBuilder<'a> {
/// Emits a logical right shift of a 32-bit value, with the shift amount held in a register.
///
/// The instruction is emitted with the bitwise `.b32` type rather than `.u32`, while the
/// destination register itself is allocated as `U32`.
///
/// * `val` - register holding the value to shift
/// * `shift` - register holding the shift amount
///
/// Returns the newly allocated destination register.
pub fn shr_u32(&mut self, val: VirtualReg, shift: VirtualReg) -> VirtualReg {
    let result = self.registers.allocate_virtual(PtxType::U32);
    // Typed as .b32 (bitwise) on purpose; the builder does not emit .u32 for shifts.
    let shift_right = PtxInstruction::new(PtxOp::Shr, PtxType::B32)
        .dst(Operand::Reg(result))
        .src(Operand::Reg(val))
        .src(Operand::Reg(shift));
    self.instructions.push(shift_right);
    result
}
/// Emits a logical right shift of a 32-bit value by a constant amount.
///
/// The shift amount is encoded as an immediate operand instead of living in a register.
/// Prefer this form inside loops whose shift amount never changes: with no shift register
/// there is nothing for SASS to reuse or clobber.
///
/// * `val` - register holding the value to shift
/// * `shift` - constant shift amount
///
/// Returns the newly allocated destination register.
pub fn shr_u32_imm(&mut self, val: VirtualReg, shift: u32) -> VirtualReg {
    let result = self.registers.allocate_virtual(PtxType::U32);
    // Lossless widening: the immediate operand variant carries a u64.
    let amount = Operand::ImmU64(u64::from(shift));
    let shift_right = PtxInstruction::new(PtxOp::Shr, PtxType::B32)
        .dst(Operand::Reg(result))
        .src(Operand::Reg(val))
        .src(amount);
    self.instructions.push(shift_right);
    result
}
/// Emits a 32-bit bitwise AND of two registers.
///
/// NOTE: the instruction uses the `.b32` (bitwise) type because PTX does not accept
/// `.u32` on and/or/xor; the destination register is still allocated as `U32`.
///
/// Returns the newly allocated destination register holding `a & b`.
pub fn and_u32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
    let result = self.registers.allocate_virtual(PtxType::U32);
    // .b32, not .u32: required by PTX for bitwise ops.
    let and_op = PtxInstruction::new(PtxOp::And, PtxType::B32)
        .dst(Operand::Reg(result))
        .src(Operand::Reg(a))
        .src(Operand::Reg(b));
    self.instructions.push(and_op);
    result
}
/// Emits a 32-bit bitwise OR of two registers.
///
/// NOTE: the instruction uses the `.b32` (bitwise) type because PTX does not accept
/// `.u32` on and/or/xor; the destination register is still allocated as `U32`.
///
/// Returns the newly allocated destination register holding `a | b`.
pub fn or_u32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
    let result = self.registers.allocate_virtual(PtxType::U32);
    // .b32, not .u32: required by PTX for bitwise ops.
    let or_op = PtxInstruction::new(PtxOp::Or, PtxType::B32)
        .dst(Operand::Reg(result))
        .src(Operand::Reg(a))
        .src(Operand::Reg(b));
    self.instructions.push(or_op);
    result
}
/// Emits a 32-bit bitwise OR whose result lands in a caller-supplied register.
///
/// Unlike `or_u32`, no new register is allocated: `dst` is reused as the destination,
/// and nothing is returned. The instruction is typed `.b32`.
///
/// * `dst` - existing register that receives `a | b`
/// * `a`, `b` - source registers
pub fn or_u32_into(&mut self, dst: VirtualReg, a: VirtualReg, b: VirtualReg) {
    let or_op = PtxInstruction::new(PtxOp::Or, PtxType::B32)
        .dst(Operand::Reg(dst))
        .src(Operand::Reg(a))
        .src(Operand::Reg(b));
    self.instructions.push(or_op);
}
/// Emits a left shift of a 32-bit value, with the shift amount held in a register.
///
/// NOTE: the instruction uses the `.b32` (bitwise) type because PTX does not accept
/// `.u32` on shift ops; the destination register is still allocated as `U32`.
///
/// * `val` - register holding the value to shift
/// * `shift` - register holding the shift amount
///
/// Returns the newly allocated destination register holding `val << shift`.
pub fn shl_u32(&mut self, val: VirtualReg, shift: VirtualReg) -> VirtualReg {
    let result = self.registers.allocate_virtual(PtxType::U32);
    // .b32, not .u32: required by PTX for shift ops.
    let shift_left = PtxInstruction::new(PtxOp::Shl, PtxType::B32)
        .dst(Operand::Reg(result))
        .src(Operand::Reg(val))
        .src(Operand::Reg(shift));
    self.instructions.push(shift_left);
    result
}
/// Emits a left shift of a 32-bit value by a constant amount.
///
/// The shift amount is encoded as an immediate operand; the instruction is typed `.b32`
/// and the destination register is allocated as `U32`.
///
/// * `val` - register holding the value to shift
/// * `shift` - constant shift amount
///
/// Returns the newly allocated destination register holding `val << shift`.
pub fn shl_u32_imm(&mut self, val: VirtualReg, shift: u32) -> VirtualReg {
    let result = self.registers.allocate_virtual(PtxType::U32);
    // Lossless widening: the immediate operand variant carries a u64.
    let amount = Operand::ImmU64(u64::from(shift));
    let shift_left = PtxInstruction::new(PtxOp::Shl, PtxType::B32)
        .dst(Operand::Reg(result))
        .src(Operand::Reg(val))
        .src(amount);
    self.instructions.push(shift_left);
    result
}
/// Emits a predicated select on 32-bit unsigned values: `dst = pred ? true_val : false_val`.
///
/// PTX format: `selp.u32 d, a, b, p`, where `d` is the destination, `a` the value taken
/// when the predicate is true, `b` the value taken when it is false, and `p` the predicate.
/// Note that the builder's parameter order puts the predicate first, whereas the emitted
/// operand order puts it last.
///
/// Returns the newly allocated `U32` destination register.
pub fn selp_u32(
    &mut self,
    pred: VirtualReg,
    true_val: VirtualReg,
    false_val: VirtualReg,
) -> VirtualReg {
    let result = self.registers.allocate_virtual(PtxType::U32);
    // Source operands follow PTX order: a (true), b (false), p (predicate).
    let select = PtxInstruction::new(PtxOp::Selp, PtxType::U32)
        .dst(Operand::Reg(result))
        .src(Operand::Reg(true_val))
        .src(Operand::Reg(false_val))
        .src(Operand::Reg(pred));
    self.instructions.push(select);
    result
}
/// Emits a predicated select on f32 values: `dst = pred ? true_val : false_val`.
///
/// PTX format: `selp.f32 d, a, b, p`. The builder's parameter order puts the predicate
/// first, whereas the emitted operand order puts it last.
///
/// PAR-062: Used by ArgMax kernel for conditional max tracking
///
/// Returns the newly allocated `F32` destination register.
pub fn selp_f32(
    &mut self,
    pred: VirtualReg,
    true_val: VirtualReg,
    false_val: VirtualReg,
) -> VirtualReg {
    let result = self.registers.allocate_virtual(PtxType::F32);
    // Source operands follow PTX order: a (true), b (false), p (predicate).
    let select = PtxInstruction::new(PtxOp::Selp, PtxType::F32)
        .dst(Operand::Reg(result))
        .src(Operand::Reg(true_val))
        .src(Operand::Reg(false_val))
        .src(Operand::Reg(pred));
    self.instructions.push(select);
    result
}
// setp_gt_f32 is provided by PtxComparison trait (comparison.rs)
/// Emits a logical AND of two predicate registers: `dst = a AND b`.
///
/// Both the instruction and the destination register use the predicate type.
/// Used for combining bounds checks (PARITY-114).
///
/// Returns the newly allocated predicate register.
pub fn and_pred(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
    let combined = self.registers.allocate_virtual(PtxType::Pred);
    let and_op = PtxInstruction::new(PtxOp::And, PtxType::Pred)
        .dst(Operand::Reg(combined))
        .src(Operand::Reg(a))
        .src(Operand::Reg(b));
    self.instructions.push(and_op);
    combined
}
/// Get shared memory base pointer
///
/// Thin alias: forwards directly to `shared_base_addr` and returns its result unchanged.
///
/// PAR-062: Returns base address of shared memory for this block
pub fn shared_ptr(&mut self) -> VirtualReg {
self.shared_base_addr()
}
/// Emits a 32-bit bitwise AND of a register with a constant.
///
/// The constant is first placed in a register via `mov_u32_imm`, and the AND itself is
/// delegated to `and_u32`; no immediate operand is used on the AND instruction.
///
/// PAR-062: Used for lane_id extraction (tid & 31)
///
/// Returns the destination register produced by `and_u32`.
pub fn and_u32_imm(&mut self, a: VirtualReg, imm: u32) -> VirtualReg {
    let mask = self.mov_u32_imm(imm);
    self.and_u32(a, mask)
}
}