oxicuda_ptx/builder/body_builder/body_builder_ext.rs
//! Extension methods for [`BodyBuilder`] — atomic/reduce operations,
//! texture/surface instructions, warp-level primitives, and barrier/
//! synchronization instructions.
//!
//! Split from `body_builder.rs` to respect the 2000-line policy.
//! Declared as a child module of `body_builder` so it can access
//! `pub(super)` fields directly.

use crate::arch::SmVersion;
use crate::error::PtxGenError;
use crate::ir::{
    AtomOp, FenceScope, GridDepAction, Instruction, MemorySpace, Operand, PtxType, ReduxOp,
    Register, RoundingMode, SetmaxnregAction, StmatrixShape,
};

use super::BodyBuilder;

18impl BodyBuilder<'_> {
19    // ════════════════════════════════════════════════════════════════════
20    //  Atomic Operations
21    // ════════════════════════════════════════════════════════════════════
22
23    /// Atomic add on global memory (f32): returns old value at `[addr]`, stores `old + val`.
24    pub fn atom_global_add_f32(&mut self, addr: Register, val: Register) -> Register {
25        self.atom_typed(MemorySpace::Global, AtomOp::Add, PtxType::F32, addr, val)
26    }
27
28    /// Atomic add on global memory (u32): returns old value at `[addr]`, stores `old + val`.
29    pub fn atom_global_add_u32(&mut self, addr: Register, val: Register) -> Register {
30        self.atom_typed(MemorySpace::Global, AtomOp::Add, PtxType::U32, addr, val)
31    }
32
33    /// Atomic add on global memory (u64): returns old value at `[addr]`, stores `old + val`.
34    pub fn atom_global_add_u64(&mut self, addr: Register, val: Register) -> Register {
35        self.atom_typed(MemorySpace::Global, AtomOp::Add, PtxType::U64, addr, val)
36    }
37
38    /// Atomic add on global memory (f64): returns old value at `[addr]`, stores `old + val`.
39    pub fn atom_global_add_f64(&mut self, addr: Register, val: Register) -> Register {
40        self.atom_typed(MemorySpace::Global, AtomOp::Add, PtxType::F64, addr, val)
41    }
42
43    /// Atomic compare-and-swap on global memory (u32).
44    ///
45    /// If `[addr] == compare`, stores `value`. Returns the old value at `[addr]`.
46    pub fn atom_global_cas_u32(
47        &mut self,
48        addr: Register,
49        compare: Register,
50        value: Register,
51    ) -> Register {
52        self.atom_cas_typed(MemorySpace::Global, PtxType::U32, addr, compare, value)
53    }
54
55    /// Atomic compare-and-swap on global memory (u64).
56    ///
57    /// If `[addr] == compare`, stores `value`. Returns the old value at `[addr]`.
58    pub fn atom_global_cas_u64(
59        &mut self,
60        addr: Register,
61        compare: Register,
62        value: Register,
63    ) -> Register {
64        self.atom_cas_typed(MemorySpace::Global, PtxType::U64, addr, compare, value)
65    }
66
67    /// Atomic exchange on global memory (u32): stores `val`, returns old value.
68    pub fn atom_global_exch_u32(&mut self, addr: Register, val: Register) -> Register {
69        self.atom_typed(MemorySpace::Global, AtomOp::Exch, PtxType::U32, addr, val)
70    }
71
72    /// Atomic min on global memory (u32).
73    pub fn atom_global_min_u32(&mut self, addr: Register, val: Register) -> Register {
74        self.atom_typed(MemorySpace::Global, AtomOp::Min, PtxType::U32, addr, val)
75    }
76
77    /// Atomic max on global memory (u32).
78    pub fn atom_global_max_u32(&mut self, addr: Register, val: Register) -> Register {
79        self.atom_typed(MemorySpace::Global, AtomOp::Max, PtxType::U32, addr, val)
80    }
81
82    /// Atomic min on global memory (s32).
83    pub fn atom_global_min_s32(&mut self, addr: Register, val: Register) -> Register {
84        self.atom_typed(MemorySpace::Global, AtomOp::Min, PtxType::S32, addr, val)
85    }
86
87    /// Atomic max on global memory (s32).
88    pub fn atom_global_max_s32(&mut self, addr: Register, val: Register) -> Register {
89        self.atom_typed(MemorySpace::Global, AtomOp::Max, PtxType::S32, addr, val)
90    }
91
92    /// Atomic bitwise AND on global memory (b32).
93    pub fn atom_global_and_b32(&mut self, addr: Register, val: Register) -> Register {
94        self.atom_typed(MemorySpace::Global, AtomOp::And, PtxType::B32, addr, val)
95    }
96
97    /// Atomic bitwise OR on global memory (b32).
98    pub fn atom_global_or_b32(&mut self, addr: Register, val: Register) -> Register {
99        self.atom_typed(MemorySpace::Global, AtomOp::Or, PtxType::B32, addr, val)
100    }
101
102    /// Atomic bitwise XOR on global memory (b32).
103    pub fn atom_global_xor_b32(&mut self, addr: Register, val: Register) -> Register {
104        self.atom_typed(MemorySpace::Global, AtomOp::Xor, PtxType::B32, addr, val)
105    }
106
107    /// Atomic add on shared memory (f32): critical for block-level reductions.
108    pub fn atom_shared_add_f32(&mut self, addr: Register, val: Register) -> Register {
109        self.atom_typed(MemorySpace::Shared, AtomOp::Add, PtxType::F32, addr, val)
110    }
111
112    /// Atomic add on shared memory (u32).
113    pub fn atom_shared_add_u32(&mut self, addr: Register, val: Register) -> Register {
114        self.atom_typed(MemorySpace::Shared, AtomOp::Add, PtxType::U32, addr, val)
115    }
116
117    /// Atomic reduction (fire-and-forget) add on global memory (f32).
118    ///
119    /// Unlike `atom`, this does not return the old value and may be faster.
120    pub fn red_global_add_f32(&mut self, addr: Register, val: Register) {
121        self.red_typed(MemorySpace::Global, AtomOp::Add, PtxType::F32, addr, val);
122    }
123
124    /// Atomic reduction (fire-and-forget) add on global memory (u32).
125    pub fn red_global_add_u32(&mut self, addr: Register, val: Register) {
126        self.red_typed(MemorySpace::Global, AtomOp::Add, PtxType::U32, addr, val);
127    }
128
129    /// Generic atomic operation helper.
130    fn atom_typed(
131        &mut self,
132        space: MemorySpace,
133        op: AtomOp,
134        ty: PtxType,
135        addr: Register,
136        src: Register,
137    ) -> Register {
138        let dst = self.regs.alloc(ty);
139        self.instructions.push(Instruction::Atom {
140            space,
141            op,
142            ty,
143            dst: dst.clone(),
144            addr: Operand::Register(addr),
145            src: Operand::Register(src),
146        });
147        dst
148    }
149
150    /// Generic atomic compare-and-swap helper.
151    fn atom_cas_typed(
152        &mut self,
153        space: MemorySpace,
154        ty: PtxType,
155        addr: Register,
156        compare: Register,
157        value: Register,
158    ) -> Register {
159        let dst = self.regs.alloc(ty);
160        self.instructions.push(Instruction::AtomCas {
161            space,
162            ty,
163            dst: dst.clone(),
164            addr: Operand::Register(addr),
165            compare: Operand::Register(compare),
166            value: Operand::Register(value),
167        });
168        dst
169    }
170
171    /// Generic atomic reduction helper (no return value).
172    fn red_typed(
173        &mut self,
174        space: MemorySpace,
175        op: AtomOp,
176        ty: PtxType,
177        addr: Register,
178        src: Register,
179    ) {
180        self.instructions.push(Instruction::Red {
181            space,
182            op,
183            ty,
184            addr: Operand::Register(addr),
185            src: Operand::Register(src),
186        });
187    }
188
189    // ════════════════════════════════════════════════════════════════════
190    //  Texture / Surface Operations
191    // ════════════════════════════════════════════════════════════════════
192
193    /// Emits a 1D texture fetch instruction.
194    ///
195    /// Fetches a texel from the named texture reference at the given
196    /// integer coordinate. Returns the destination register.
197    ///
198    /// Emits: `tex.1d.v4.{ty}.s32 dst, [tex_ref, {coord}];`
199    pub fn tex_1d(&mut self, ty: PtxType, tex_ref: &str, coord: Operand) -> Register {
200        let dst = self.regs.alloc(ty);
201        self.instructions.push(Instruction::Tex1d {
202            ty,
203            dst: dst.clone(),
204            tex_ref: tex_ref.to_string(),
205            coord,
206        });
207        dst
208    }
209
210    /// Emits a 2D texture fetch instruction.
211    ///
212    /// Fetches a texel from the named texture reference at the given
213    /// (x, y) integer coordinates. Returns the destination register.
214    ///
215    /// Emits: `tex.2d.v4.{ty}.s32 dst, [tex_ref, {coord_x, coord_y}];`
216    pub fn tex_2d(
217        &mut self,
218        ty: PtxType,
219        tex_ref: &str,
220        coord_x: Operand,
221        coord_y: Operand,
222    ) -> Register {
223        let dst = self.regs.alloc(ty);
224        self.instructions.push(Instruction::Tex2d {
225            ty,
226            dst: dst.clone(),
227            tex_ref: tex_ref.to_string(),
228            coord_x,
229            coord_y,
230        });
231        dst
232    }
233
234    /// Emits a 3D texture fetch instruction.
235    ///
236    /// Fetches a texel from the named texture reference at the given
237    /// (x, y, z) integer coordinates. Returns the destination register.
238    ///
239    /// Emits: `tex.3d.v4.{ty}.s32 dst, [tex_ref, {coord_x, coord_y, coord_z}];`
240    pub fn tex_3d(
241        &mut self,
242        ty: PtxType,
243        tex_ref: &str,
244        coord_x: Operand,
245        coord_y: Operand,
246        coord_z: Operand,
247    ) -> Register {
248        let dst = self.regs.alloc(ty);
249        self.instructions.push(Instruction::Tex3d {
250            ty,
251            dst: dst.clone(),
252            tex_ref: tex_ref.to_string(),
253            coord_x,
254            coord_y,
255            coord_z,
256        });
257        dst
258    }
259
260    /// Emits a 1D surface load instruction.
261    ///
262    /// Loads a value from the named surface reference at the given
263    /// coordinate. Returns the destination register.
264    ///
265    /// Emits: `suld.b.1d.{ty} dst, [surf_ref, {coord}];`
266    pub fn surf_load(&mut self, ty: PtxType, surf_ref: &str, coord: Operand) -> Register {
267        let dst = self.regs.alloc(ty);
268        self.instructions.push(Instruction::SurfLoad {
269            ty,
270            dst: dst.clone(),
271            surf_ref: surf_ref.to_string(),
272            coord,
273        });
274        dst
275    }
276
277    /// Emits a 1D surface store instruction.
278    ///
279    /// Stores a value to the named surface reference at the given coordinate.
280    ///
281    /// Emits: `sust.b.1d.{ty} [surf_ref, {coord}], src;`
282    pub fn surf_store(&mut self, ty: PtxType, surf_ref: &str, coord: Operand, src: Register) {
283        self.instructions.push(Instruction::SurfStore {
284            ty,
285            surf_ref: surf_ref.to_string(),
286            coord,
287            src,
288        });
289    }
290
291    // ════════════════════════════════════════════════════════════════════
292    //  Warp-level Primitives (SM >= 80/90)
293    // ════════════════════════════════════════════════════════════════════
294
295    /// Warp-level sum reduction on a `u32` value (SM >= 80).
296    pub fn redux_add_u32(&mut self, src: &str) -> Result<String, PtxGenError> {
297        self.redux_op(ReduxOp::Add, src)
298    }
299
300    /// Warp-level max reduction on a `u32` value (SM >= 80).
301    pub fn redux_max_u32(&mut self, src: &str) -> Result<String, PtxGenError> {
302        self.redux_op(ReduxOp::Max, src)
303    }
304
305    /// Warp-level min reduction on a `u32` value (SM >= 80).
306    pub fn redux_min_u32(&mut self, src: &str) -> Result<String, PtxGenError> {
307        self.redux_op(ReduxOp::Min, src)
308    }
309
310    fn redux_op(&mut self, op: ReduxOp, src: &str) -> Result<String, PtxGenError> {
311        if !self.target.capabilities().has_redux {
312            return Err(PtxGenError::GenerationFailed(format!(
313                "redux.sync requires SM >= 80, target is {}",
314                self.target
315            )));
316        }
317        let dst = self.regs.alloc(PtxType::U32);
318        let name = dst.name.clone();
319        self.instructions.push(Instruction::Redux {
320            op,
321            dst,
322            src: Operand::Register(Register {
323                name: src.to_string(),
324                ty: PtxType::U32,
325            }),
326            membership_mask: 0xFFFF_FFFF,
327        });
328        Ok(name)
329    }
330
331    /// Store matrix m8n8x4 to shared memory (SM >= 90).
332    pub fn stmatrix_m8n8x4(&mut self, addr: &str, src: &str) -> Result<(), PtxGenError> {
333        if !self.target.capabilities().has_stmatrix {
334            return Err(PtxGenError::GenerationFailed(format!(
335                "stmatrix requires SM >= 90, target is {}",
336                self.target
337            )));
338        }
339        self.instructions.push(Instruction::Stmatrix {
340            dst_addr: Operand::Register(Register {
341                name: addr.to_string(),
342                ty: PtxType::U32,
343            }),
344            src: Register {
345                name: src.to_string(),
346                ty: PtxType::B32,
347            },
348            shape: StmatrixShape::M8n8x4,
349            trans: false,
350        });
351        Ok(())
352    }
353
354    /// Elect a single warp leader (SM >= 90). Returns predicate register name.
355    pub fn elect_sync(&mut self) -> Result<String, PtxGenError> {
356        if !self.target.capabilities().has_elect_one {
357            return Err(PtxGenError::GenerationFailed(format!(
358                "elect.sync requires SM >= 90, target is {}",
359                self.target
360            )));
361        }
362        let dst = self.regs.alloc(PtxType::Pred);
363        let name = dst.name.clone();
364        self.instructions.push(Instruction::ElectSync {
365            dst,
366            membership_mask: 0xFFFF_FFFF,
367        });
368        Ok(name)
369    }
370
371    /// Increase the maximum register count (SM >= 90).
372    pub fn setmaxnreg_inc(&mut self, count: u32) -> Result<(), PtxGenError> {
373        self.setmaxnreg_impl(count, SetmaxnregAction::Inc)
374    }
375
376    /// Decrease the maximum register count (SM >= 90).
377    pub fn setmaxnreg_dec(&mut self, count: u32) -> Result<(), PtxGenError> {
378        self.setmaxnreg_impl(count, SetmaxnregAction::Dec)
379    }
380
381    fn setmaxnreg_impl(&mut self, count: u32, action: SetmaxnregAction) -> Result<(), PtxGenError> {
382        if !self.target.capabilities().has_setmaxnreg {
383            return Err(PtxGenError::GenerationFailed(format!(
384                "setmaxnreg requires SM >= 90, target is {}",
385                self.target
386            )));
387        }
388        self.instructions.push(Instruction::Setmaxnreg {
389            reg_count: count,
390            action,
391        });
392        Ok(())
393    }
394
395    // ════════════════════════════════════════════════════════════════════
396    //  Barrier / Synchronization (SM >= 90)
397    // ════════════════════════════════════════════════════════════════════
398
399    /// Signal that dependent grids may launch (SM >= 90).
400    pub fn griddepcontrol_launch_dependents(&mut self) -> Result<(), PtxGenError> {
401        if !self.target.capabilities().has_griddepcontrol {
402            return Err(PtxGenError::GenerationFailed(format!(
403                "griddepcontrol requires SM >= 90, target is {}",
404                self.target
405            )));
406        }
407        self.instructions.push(Instruction::Griddepcontrol {
408            action: GridDepAction::LaunchDependents,
409        });
410        Ok(())
411    }
412
413    /// Wait for grid dependencies to complete (SM >= 90).
414    pub fn griddepcontrol_wait(&mut self) -> Result<(), PtxGenError> {
415        if !self.target.capabilities().has_griddepcontrol {
416            return Err(PtxGenError::GenerationFailed(format!(
417                "griddepcontrol requires SM >= 90, target is {}",
418                self.target
419            )));
420        }
421        self.instructions.push(Instruction::Griddepcontrol {
422            action: GridDepAction::Wait,
423        });
424        Ok(())
425    }
426
427    /// Emit a proxy fence for async operations.
428    pub fn fence_proxy_async(&mut self, scope: &str) -> Result<(), PtxGenError> {
429        let fence_scope = match scope {
430            "cta" => FenceScope::Cta,
431            "gpu" => FenceScope::Gpu,
432            "sys" => FenceScope::Sys,
433            other => {
434                return Err(PtxGenError::GenerationFailed(format!(
435                    "unknown fence scope: {other}"
436                )));
437            }
438        };
439        self.instructions.push(Instruction::FenceProxy {
440            scope: fence_scope,
441            space: MemorySpace::Shared,
442        });
443        Ok(())
444    }
445
446    /// Initialize an mbarrier in shared memory (SM >= 90).
447    pub fn mbarrier_init(&mut self, addr: &str, count: &str) -> Result<(), PtxGenError> {
448        if !self.target.capabilities().has_cluster_barriers {
449            return Err(PtxGenError::GenerationFailed(format!(
450                "mbarrier requires SM >= 90, target is {}",
451                self.target
452            )));
453        }
454        self.instructions.push(Instruction::MbarrierInit {
455            addr: Operand::Register(Register {
456                name: addr.to_string(),
457                ty: PtxType::U64,
458            }),
459            count: Operand::Register(Register {
460                name: count.to_string(),
461                ty: PtxType::U32,
462            }),
463        });
464        Ok(())
465    }
466
467    /// Signal arrival at an mbarrier (SM >= 90).
468    pub fn mbarrier_arrive(&mut self, addr: &str) -> Result<(), PtxGenError> {
469        if !self.target.capabilities().has_cluster_barriers {
470            return Err(PtxGenError::GenerationFailed(format!(
471                "mbarrier requires SM >= 90, target is {}",
472                self.target
473            )));
474        }
475        self.instructions.push(Instruction::MbarrierArrive {
476            addr: Operand::Register(Register {
477                name: addr.to_string(),
478                ty: PtxType::U64,
479            }),
480        });
481        Ok(())
482    }
483
484    /// Wait on an mbarrier phase (SM >= 90).
485    pub fn mbarrier_wait(&mut self, addr: &str, phase: &str) -> Result<(), PtxGenError> {
486        if !self.target.capabilities().has_cluster_barriers {
487            return Err(PtxGenError::GenerationFailed(format!(
488                "mbarrier requires SM >= 90, target is {}",
489                self.target
490            )));
491        }
492        self.instructions.push(Instruction::MbarrierWait {
493            addr: Operand::Register(Register {
494                name: addr.to_string(),
495                ty: PtxType::U64,
496            }),
497            phase: Operand::Register(Register {
498                name: phase.to_string(),
499                ty: PtxType::U32,
500            }),
501        });
502        Ok(())
503    }
504
505    // ════════════════════════════════════════════════════════════════════
506    //  FP4 (E2M1) Type Conversions — Blackwell (SM >= 100)
507    // ════════════════════════════════════════════════════════════════════
508
509    /// Convert an `f32` register to FP4 E2M1 format (SM >= 100, Blackwell).
510    ///
511    /// Emits `cvt.rn.e2m1.f32 dst, src;` and returns the destination register.
512    /// The FP4 result is stored in the low 4 bits of a B32 register container.
513    pub fn cvt_f32_to_e2m1(&mut self, src: Register) -> Result<Register, PtxGenError> {
514        if self.target < SmVersion::Sm100 {
515            return Err(PtxGenError::GenerationFailed(format!(
516                "cvt.e2m1 requires SM >= 100 (Blackwell), target is {}",
517                self.target
518            )));
519        }
520        let dst = self.regs.alloc(PtxType::E2M1);
521        self.instructions.push(Instruction::Cvt {
522            rnd: Some(RoundingMode::Rn),
523            dst_ty: PtxType::E2M1,
524            src_ty: PtxType::F32,
525            dst: dst.clone(),
526            src: Operand::Register(src),
527        });
528        Ok(dst)
529    }
530
531    /// Convert an FP4 E2M1 register to `f32` (SM >= 100, Blackwell).
532    ///
533    /// Emits `cvt.f32.e2m1 dst, src;` and returns the destination register.
534    pub fn cvt_e2m1_to_f32(&mut self, src: Register) -> Result<Register, PtxGenError> {
535        if self.target < SmVersion::Sm100 {
536            return Err(PtxGenError::GenerationFailed(format!(
537                "cvt.f32.e2m1 requires SM >= 100 (Blackwell), target is {}",
538                self.target
539            )));
540        }
541        let dst = self.regs.alloc(PtxType::F32);
542        self.instructions.push(Instruction::Cvt {
543            rnd: None,
544            dst_ty: PtxType::F32,
545            src_ty: PtxType::E2M1,
546            dst: dst.clone(),
547            src: Operand::Register(src),
548        });
549        Ok(dst)
550    }
551
552    // ════════════════════════════════════════════════════════════════════
553    //  tcgen05 MMA — 5th-gen Tensor Core (SM >= 100)
554    // ════════════════════════════════════════════════════════════════════
555
556    /// Emit a `tcgen05.mma.cta_group::1.kind::f32` instruction (SM >= 100).
557    ///
558    /// This is the Blackwell 5th-generation Tensor Core MMA that operates on
559    /// 128×256×256 E2M1 tiles referenced by descriptors stored in 64-bit registers.
560    pub fn tcgen05_mma_m128n256k256_e2m1(
561        &mut self,
562        a_desc: Register,
563        b_desc: Register,
564    ) -> Result<(), PtxGenError> {
565        if self.target < SmVersion::Sm100 {
566            return Err(PtxGenError::GenerationFailed(format!(
567                "tcgen05.mma requires SM >= 100 (Blackwell), target is {}",
568                self.target
569            )));
570        }
571        self.instructions
572            .push(Instruction::Tcgen05Mma { a_desc, b_desc });
573        Ok(())
574    }
575
576    // ════════════════════════════════════════════════════════════════════
577    //  Cluster-level Barrier & Fence — SM >= 90 (Hopper+)
578    // ════════════════════════════════════════════════════════════════════
579
580    /// Emit `barrier.cluster.arrive;` — signal cluster barrier (SM >= 90).
581    ///
582    /// All CTAs in the cluster must arrive before any may continue past
583    /// the corresponding `barrier.cluster.wait`.
584    pub fn barrier_cluster(&mut self) -> Result<(), PtxGenError> {
585        if !self.target.capabilities().has_cluster_barriers {
586            return Err(PtxGenError::GenerationFailed(format!(
587                "barrier.cluster requires SM >= 90, target is {}",
588                self.target
589            )));
590        }
591        self.instructions.push(Instruction::BarrierCluster);
592        Ok(())
593    }
594
595    /// Emit `fence.mbarrier_init.release.cluster;` — cluster release fence (SM >= 90).
596    ///
597    /// Ensures that all preceding memory operations (including mbarrier
598    /// initializations) are visible cluster-wide before the barrier is observed.
599    pub fn fence_cluster(&mut self) -> Result<(), PtxGenError> {
600        if !self.target.capabilities().has_cluster_barriers {
601            return Err(PtxGenError::GenerationFailed(format!(
602                "fence.cluster requires SM >= 90, target is {}",
603                self.target
604            )));
605        }
606        self.instructions.push(Instruction::FenceCluster);
607        Ok(())
608    }
609
610    // ════════════════════════════════════════════════════════════════════
611    //  TMA Descriptor-based Bulk Async Copy — SM >= 90 (Hopper+)
612    // ════════════════════════════════════════════════════════════════════
613
614    /// Emit a 1-D TMA descriptor-based bulk async copy (SM >= 90).
615    ///
616    /// Emits:
617    /// ```ptx
618    /// cp.async.bulk.tensor.1d.shared::cluster.global.tile.bulk_group
619    ///     [dst_smem], [src_gmem, {desc}];
620    /// ```
621    ///
622    /// `dst_smem` is the destination shared-memory address register,
623    /// `src_gmem` is the global-memory base address register, and
624    /// `desc` is the coordinate / descriptor register.
625    pub fn cp_async_bulk_tensor_1d(
626        &mut self,
627        dst_smem: Register,
628        src_gmem: Register,
629        desc: Register,
630    ) -> Result<(), PtxGenError> {
631        if !self.target.capabilities().has_bulk_copy {
632            return Err(PtxGenError::GenerationFailed(format!(
633                "cp.async.bulk.tensor requires SM >= 90, target is {}",
634                self.target
635            )));
636        }
637        self.instructions.push(Instruction::CpAsyncBulk {
638            dst_smem,
639            src_gmem,
640            desc,
641        });
642        Ok(())
643    }
644}