oxicuda-dnn 0.2.0

//! Deformable Convolution v2 (DCNv2).
//!
//! Implements deformable convolution as described in "Deformable ConvNets v2:
//! More Deformable, Better Results" (Zhu et al., 2019). DCNv2 learns 2D
//! sampling offsets **and** per-sample modulation masks, allowing the
//! convolution to attend to irregular spatial patterns rather than the fixed
//! rectangular grid of standard convolution.
//!
//! # Architecture
//!
//! A deformable convolution replaces the fixed sampling grid of a regular
//! convolution with learned offsets:
//!
//! ```text
//! y(p0) = sum_{k} w(k) * x(p0 + pk + delta_pk) * m(k)   // DCNv2
//! ```
//!
//! where `pk` enumerates regular grid positions, `delta_pk` is the learned
//! offset (2 values per kernel position), and `m(k)` is the learned
//! modulation mask (1 value per kernel position). When `use_modulation` is
//! false, `m(k) = 1` and the operation reduces to DCNv1.
//!
//! Fractional sampling positions are resolved via bilinear interpolation
//! over the four nearest input pixels.
//!
//! # Kernels generated
//!
//! | Method | Description |
//! |--------|-------------|
//! | [`generate_forward`](DeformableConvPlan::generate_forward) | Forward pass |
//! | [`generate_backward_input`](DeformableConvPlan::generate_backward_input) | Gradient w.r.t. input |
//! | [`generate_backward_offset`](DeformableConvPlan::generate_backward_offset) | Gradient w.r.t. offsets |
//! | [`generate_backward_weight`](DeformableConvPlan::generate_backward_weight) | Gradient w.r.t. weights |

use oxicuda_ptx::arch::SmVersion;
use oxicuda_ptx::builder::KernelBuilder;
use oxicuda_ptx::ir::PtxType;

use crate::error::{DnnError, DnnResult};

// ---------------------------------------------------------------------------
// DeformableConvConfig
// ---------------------------------------------------------------------------

/// Configuration for Deformable Convolution v2 (DCNv2).
///
/// DCNv2 learns 2D sampling offsets and per-sample modulation masks,
/// allowing the convolution to attend to irregular spatial patterns.
///
/// # Offset groups
///
/// The `offset_groups` parameter controls how many independent sets of
/// offsets are learned. Typically 1 (all channels share offsets) or equal
/// to `in_channels` (per-channel offsets). `in_channels` must be divisible
/// by `offset_groups`.
#[derive(Debug, Clone)]
pub struct DeformableConvConfig {
    /// Number of input channels.
    pub in_channels: u32,
    /// Number of output channels.
    pub out_channels: u32,
    /// Kernel height.
    pub kernel_h: u32,
    /// Kernel width.
    pub kernel_w: u32,
    /// Stride height.
    pub stride_h: u32,
    /// Stride width.
    pub stride_w: u32,
    /// Padding height.
    pub pad_h: u32,
    /// Padding width.
    pub pad_w: u32,
    /// Dilation height.
    pub dilation_h: u32,
    /// Dilation width.
    pub dilation_w: u32,
    /// Number of offset groups (typically 1 or same as `in_channels`).
    pub offset_groups: u32,
    /// Whether to use DCNv2 modulation masks (if false, DCNv1 behaviour).
    pub use_modulation: bool,
    /// Target GPU SM version.
    pub sm_version: SmVersion,
    /// Floating-point precision (`F32` or `F16`).
    pub float_type: PtxType,
}

impl DeformableConvConfig {
    /// Computes the output spatial dimensions.
    ///
    /// Uses the standard convolution output formula:
    /// ```text
    /// out_h = (in_h + 2*pad_h - dilation_h*(kernel_h - 1) - 1) / stride_h + 1
    /// out_w = (in_w + 2*pad_w - dilation_w*(kernel_w - 1) - 1) / stride_w + 1
    /// ```
    #[must_use]
    pub fn output_size(&self, in_h: u32, in_w: u32) -> (u32, u32) {
        let effective_kh = self
            .dilation_h
            .saturating_mul(self.kernel_h.saturating_sub(1))
            + 1;
        let effective_kw = self
            .dilation_w
            .saturating_mul(self.kernel_w.saturating_sub(1))
            + 1;

        let padded_h = in_h + 2 * self.pad_h;
        let padded_w = in_w + 2 * self.pad_w;

        let out_h = if padded_h >= effective_kh {
            (padded_h - effective_kh) / self.stride_h.max(1) + 1
        } else {
            0
        };
        let out_w = if padded_w >= effective_kw {
            (padded_w - effective_kw) / self.stride_w.max(1) + 1
        } else {
            0
        };

        (out_h, out_w)
    }

    /// Validates the deformable convolution configuration.
    ///
    /// # Errors
    ///
    /// Returns [`DnnError::InvalidArgument`] if:
    /// - `kernel_h` or `kernel_w` is zero
    /// - `stride_h` or `stride_w` is zero
    /// - `dilation_h` or `dilation_w` is zero
    /// - `in_channels` or `out_channels` is zero
    /// - `offset_groups` is zero
    /// - `in_channels` is not divisible by `offset_groups`
    /// - `float_type` is not `F32` or `F16`
    pub fn validate(&self) -> DnnResult<()> {
        if self.kernel_h == 0 || self.kernel_w == 0 {
            return Err(DnnError::InvalidArgument(
                "deformable conv: kernel dimensions must be > 0".into(),
            ));
        }
        if self.stride_h == 0 || self.stride_w == 0 {
            return Err(DnnError::InvalidArgument(
                "deformable conv: stride must be > 0".into(),
            ));
        }
        if self.dilation_h == 0 || self.dilation_w == 0 {
            return Err(DnnError::InvalidArgument(
                "deformable conv: dilation must be > 0".into(),
            ));
        }
        if self.in_channels == 0 || self.out_channels == 0 {
            return Err(DnnError::InvalidArgument(
                "deformable conv: channel counts must be > 0".into(),
            ));
        }
        if self.offset_groups == 0 {
            return Err(DnnError::InvalidArgument(
                "deformable conv: offset_groups must be > 0".into(),
            ));
        }
        if self.in_channels % self.offset_groups != 0 {
            return Err(DnnError::InvalidArgument(format!(
                "deformable conv: in_channels ({}) not divisible by offset_groups ({})",
                self.in_channels, self.offset_groups
            )));
        }
        if !matches!(self.float_type, PtxType::F32 | PtxType::F16) {
            return Err(DnnError::InvalidArgument(format!(
                "deformable conv: unsupported float_type {:?}, expected F32 or F16",
                self.float_type
            )));
        }
        Ok(())
    }

    /// Returns the number of input channels per offset group.
    #[must_use]
    pub fn channels_per_offset_group(&self) -> u32 {
        if self.offset_groups == 0 {
            return 0;
        }
        self.in_channels / self.offset_groups
    }

    /// Returns the total number of offset values per spatial position.
    ///
    /// Each kernel position requires 2 offsets (dy, dx) per offset group.
    #[must_use]
    pub fn offset_channels(&self) -> u32 {
        2 * self.kernel_h * self.kernel_w * self.offset_groups
    }

    /// Returns the total number of modulation mask values per spatial position.
    ///
    /// Each kernel position has one mask value per offset group (DCNv2 only).
    #[must_use]
    pub fn mask_channels(&self) -> u32 {
        self.kernel_h * self.kernel_w * self.offset_groups
    }

    /// Returns the effective kernel height accounting for dilation.
    #[must_use]
    pub fn effective_kernel_h(&self) -> u32 {
        self.dilation_h * (self.kernel_h.saturating_sub(1)) + 1
    }

    /// Returns the effective kernel width accounting for dilation.
    #[must_use]
    pub fn effective_kernel_w(&self) -> u32 {
        self.dilation_w * (self.kernel_w.saturating_sub(1)) + 1
    }
}

// ---------------------------------------------------------------------------
// DeformableConvPlan
// ---------------------------------------------------------------------------

/// Execution plan for a deformable convolution.
///
/// Pre-computes and validates all configuration parameters so that PTX
/// generation can proceed without redundant checks.
#[derive(Debug)]
pub struct DeformableConvPlan {
    /// The validated configuration.
    config: DeformableConvConfig,
}

impl DeformableConvPlan {
    /// Creates a new deformable convolution plan.
    ///
    /// Validates the configuration and returns an execution plan.
    ///
    /// # Errors
    ///
    /// Returns [`DnnError::InvalidArgument`] if the configuration is invalid.
    pub fn new(config: DeformableConvConfig) -> DnnResult<Self> {
        config.validate()?;
        Ok(Self { config })
    }

    /// Returns a reference to the underlying configuration.
    #[must_use]
    pub fn config(&self) -> &DeformableConvConfig {
        &self.config
    }

    /// Computes the output spatial dimensions for a given input size.
    #[must_use]
    pub fn output_size(&self, in_h: u32, in_w: u32) -> (u32, u32) {
        self.config.output_size(in_h, in_w)
    }

    /// Generates the forward-pass PTX kernel.
    ///
    /// The forward kernel computes:
    /// ```text
    /// y(n, c_out, oh, ow) = sum_{c_in, kh, kw}
    ///     w(c_out, c_in, kh, kw)
    ///     * bilinear_sample(x, n, c_in, h_sample, w_sample)
    ///     * mask(n, group, kh, kw, oh, ow)   // DCNv2 only
    /// ```
    ///
    /// where `h_sample = oh*stride_h - pad_h + kh*dilation_h + offset_h`
    /// and similarly for `w_sample`.
    ///
    /// # Errors
    ///
    /// Returns [`DnnError::PtxGeneration`] if kernel building fails.
    pub fn generate_forward(&self) -> DnnResult<String> {
        let c = &self.config;
        let (float_type, elem_bytes, precision) = float_type_info(c.float_type)?;
        let kernel_name = format!(
            "deformable_conv_forward_{precision}_{}x{}",
            c.kernel_h, c.kernel_w
        );
        let sm = c.sm_version;

        let params = DeformableBodyParams {
            float_type,
            elem_bytes,
            kernel_h: c.kernel_h,
            kernel_w: c.kernel_w,
            stride_h: c.stride_h,
            stride_w: c.stride_w,
            pad_h: c.pad_h,
            pad_w: c.pad_w,
            dilation_h: c.dilation_h,
            dilation_w: c.dilation_w,
            offset_groups: c.offset_groups,
            use_modulation: c.use_modulation,
            channels_per_offset_group: c.channels_per_offset_group(),
        };

        let ptx = KernelBuilder::new(&kernel_name)
            .target(sm)
            // Pointers
            .param("input", PtxType::U64)
            .param("offset", PtxType::U64)
            .param("mask", PtxType::U64)
            .param("weight", PtxType::U64)
            .param("bias", PtxType::U64)
            .param("output", PtxType::U64)
            // Dimensions
            .param("batch_size", PtxType::U32)
            .param("in_channels", PtxType::U32)
            .param("in_h", PtxType::U32)
            .param("in_w", PtxType::U32)
            .param("out_channels", PtxType::U32)
            .param("out_h", PtxType::U32)
            .param("out_w", PtxType::U32)
            .param("total_outputs", PtxType::U32)
            .body(move |b| {
                emit_forward_body(b, &params);
            })
            .build()
            .map_err(|e| DnnError::PtxGeneration(e.to_string()))?;

        Ok(ptx)
    }

    /// Generates the backward-input PTX kernel (gradient w.r.t. input).
    ///
    /// Scatters output gradients back to input positions using the bilinear
    /// interpolation weights from the forward pass. Each output gradient
    /// contribution is atomically added to the four neighbouring input
    /// pixels.
    ///
    /// # Errors
    ///
    /// Returns [`DnnError::PtxGeneration`] if kernel building fails.
    pub fn generate_backward_input(&self) -> DnnResult<String> {
        let c = &self.config;
        let (float_type, elem_bytes, precision) = float_type_info(c.float_type)?;
        let kernel_name = format!(
            "deformable_conv_backward_input_{precision}_{}x{}",
            c.kernel_h, c.kernel_w
        );
        let sm = c.sm_version;

        let params = DeformableBodyParams {
            float_type,
            elem_bytes,
            kernel_h: c.kernel_h,
            kernel_w: c.kernel_w,
            stride_h: c.stride_h,
            stride_w: c.stride_w,
            pad_h: c.pad_h,
            pad_w: c.pad_w,
            dilation_h: c.dilation_h,
            dilation_w: c.dilation_w,
            offset_groups: c.offset_groups,
            use_modulation: c.use_modulation,
            channels_per_offset_group: c.channels_per_offset_group(),
        };

        let ptx = KernelBuilder::new(&kernel_name)
            .target(sm)
            .param("grad_output", PtxType::U64)
            .param("offset", PtxType::U64)
            .param("mask", PtxType::U64)
            .param("weight", PtxType::U64)
            .param("grad_input", PtxType::U64)
            // Dimensions
            .param("batch_size", PtxType::U32)
            .param("in_channels", PtxType::U32)
            .param("in_h", PtxType::U32)
            .param("in_w", PtxType::U32)
            .param("out_channels", PtxType::U32)
            .param("out_h", PtxType::U32)
            .param("out_w", PtxType::U32)
            .param("total_outputs", PtxType::U32)
            .body(move |b| {
                emit_backward_input_body(b, &params);
            })
            .build()
            .map_err(|e| DnnError::PtxGeneration(e.to_string()))?;

        Ok(ptx)
    }

    /// Generates the backward-offset PTX kernel (gradient w.r.t. offsets).
    ///
    /// Computes `d_offset` from the spatial gradient of the bilinearly
    /// interpolated input values. For each sampling position, the gradient
    /// of the bilinear interpolation w.r.t. the offset is the spatial
    /// gradient of the input at that position, weighted by the output
    /// gradient and (if DCNv2) the modulation mask.
    ///
    /// # Errors
    ///
    /// Returns [`DnnError::PtxGeneration`] if kernel building fails.
    pub fn generate_backward_offset(&self) -> DnnResult<String> {
        let c = &self.config;
        let (float_type, elem_bytes, precision) = float_type_info(c.float_type)?;
        let kernel_name = format!(
            "deformable_conv_backward_offset_{precision}_{}x{}",
            c.kernel_h, c.kernel_w
        );
        let sm = c.sm_version;

        let params = DeformableBodyParams {
            float_type,
            elem_bytes,
            kernel_h: c.kernel_h,
            kernel_w: c.kernel_w,
            stride_h: c.stride_h,
            stride_w: c.stride_w,
            pad_h: c.pad_h,
            pad_w: c.pad_w,
            dilation_h: c.dilation_h,
            dilation_w: c.dilation_w,
            offset_groups: c.offset_groups,
            use_modulation: c.use_modulation,
            channels_per_offset_group: c.channels_per_offset_group(),
        };

        let ptx = KernelBuilder::new(&kernel_name)
            .target(sm)
            .param("grad_output", PtxType::U64)
            .param("input", PtxType::U64)
            .param("offset", PtxType::U64)
            .param("mask", PtxType::U64)
            .param("weight", PtxType::U64)
            .param("grad_offset", PtxType::U64)
            .param("grad_mask", PtxType::U64)
            // Dimensions
            .param("batch_size", PtxType::U32)
            .param("in_channels", PtxType::U32)
            .param("in_h", PtxType::U32)
            .param("in_w", PtxType::U32)
            .param("out_channels", PtxType::U32)
            .param("out_h", PtxType::U32)
            .param("out_w", PtxType::U32)
            .param("total_outputs", PtxType::U32)
            .body(move |b| {
                emit_backward_offset_body(b, &params);
            })
            .build()
            .map_err(|e| DnnError::PtxGeneration(e.to_string()))?;

        Ok(ptx)
    }

    /// Generates the backward-weight PTX kernel (gradient w.r.t. weights).
    ///
    /// Computes `d_weight` by accumulating the outer product of output
    /// gradients and deformed input samples. Each thread handles one
    /// weight element `(c_out, c_in, kh, kw)` and sums over batch and
    /// spatial positions.
    ///
    /// # Errors
    ///
    /// Returns [`DnnError::PtxGeneration`] if kernel building fails.
    pub fn generate_backward_weight(&self) -> DnnResult<String> {
        let c = &self.config;
        let (float_type, elem_bytes, precision) = float_type_info(c.float_type)?;
        let kernel_name = format!(
            "deformable_conv_backward_weight_{precision}_{}x{}",
            c.kernel_h, c.kernel_w
        );
        let sm = c.sm_version;

        let params = DeformableBodyParams {
            float_type,
            elem_bytes,
            kernel_h: c.kernel_h,
            kernel_w: c.kernel_w,
            stride_h: c.stride_h,
            stride_w: c.stride_w,
            pad_h: c.pad_h,
            pad_w: c.pad_w,
            dilation_h: c.dilation_h,
            dilation_w: c.dilation_w,
            offset_groups: c.offset_groups,
            use_modulation: c.use_modulation,
            channels_per_offset_group: c.channels_per_offset_group(),
        };

        let ptx = KernelBuilder::new(&kernel_name)
            .target(sm)
            .param("grad_output", PtxType::U64)
            .param("input", PtxType::U64)
            .param("offset", PtxType::U64)
            .param("mask", PtxType::U64)
            .param("grad_weight", PtxType::U64)
            // Dimensions
            .param("batch_size", PtxType::U32)
            .param("in_channels", PtxType::U32)
            .param("in_h", PtxType::U32)
            .param("in_w", PtxType::U32)
            .param("out_channels", PtxType::U32)
            .param("out_h", PtxType::U32)
            .param("out_w", PtxType::U32)
            .param("total_weight_elements", PtxType::U32)
            .body(move |b| {
                emit_backward_weight_body(b, &params);
            })
            .build()
            .map_err(|e| DnnError::PtxGeneration(e.to_string()))?;

        Ok(ptx)
    }
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/// Shared body parameters for all deformable conv kernels.
#[derive(Debug, Clone, Copy)]
struct DeformableBodyParams {
    float_type: PtxType,
    elem_bytes: u32,
    kernel_h: u32,
    kernel_w: u32,
    stride_h: u32,
    stride_w: u32,
    pad_h: u32,
    pad_w: u32,
    dilation_h: u32,
    dilation_w: u32,
    offset_groups: u32,
    use_modulation: bool,
    channels_per_offset_group: u32,
}

/// Resolves float type to (PtxType, elem_bytes, precision_str).
fn float_type_info(float_type: PtxType) -> DnnResult<(PtxType, u32, &'static str)> {
    match float_type {
        PtxType::F32 => Ok((PtxType::F32, 4, "f32")),
        PtxType::F16 => Ok((PtxType::F16, 2, "f16")),
        other => Err(DnnError::InvalidArgument(format!(
            "deformable conv: unsupported float_type {other:?}"
        ))),
    }
}

/// Emits a zero constant for the given float type.
fn emit_zero(b: &mut oxicuda_ptx::builder::BodyBuilder<'_>, float_type: PtxType, dst: &str) {
    match float_type {
        PtxType::F32 => {
            let zero_bits = 0f32.to_bits();
            b.raw_ptx(&format!("mov.b32 {dst}, 0F{zero_bits:08X};"));
        }
        PtxType::F16 => {
            // F16 zero = 0x0000
            b.raw_ptx(&format!("mov.b16 {dst}, 0x0000;"));
        }
        _ => {
            // Fallback: treat as 32-bit zero
            let zero_bits = 0f32.to_bits();
            b.raw_ptx(&format!("mov.b32 {dst}, 0F{zero_bits:08X};"));
        }
    }
}

/// Emits a one constant for the given float type.
fn emit_one(b: &mut oxicuda_ptx::builder::BodyBuilder<'_>, float_type: PtxType, dst: &str) {
    match float_type {
        PtxType::F32 => {
            let one_bits = 1.0f32.to_bits();
            b.raw_ptx(&format!("mov.b32 {dst}, 0F{one_bits:08X};"));
        }
        PtxType::F16 => {
            // F16 1.0 = 0x3C00
            b.raw_ptx(&format!("mov.b16 {dst}, 0x3C00;"));
        }
        _ => {
            let one_bits = 1.0f32.to_bits();
            b.raw_ptx(&format!("mov.b32 {dst}, 0F{one_bits:08X};"));
        }
    }
}

/// Returns the PTX float arithmetic suffix ("f32" or "f16").
fn ptx_float_suffix(float_type: PtxType) -> &'static str {
    match float_type {
        PtxType::F16 => "f16",
        _ => "f32",
    }
}

/// Returns the PTX load/store type suffix.
fn ptx_load_type(float_type: PtxType) -> &'static str {
    match float_type {
        PtxType::F16 => "b16",
        _ => "f32",
    }
}

/// Emits a half-precision (`f16`) atomic add via a 32-bit compare-and-swap
/// loop.
///
/// CUDA exposes a native `atom.global.add.f16` only on a subset of the
/// architectures DCNv2 targets, so the deformable backward-input scatter
/// cannot rely on it. A plain `st.global` would lose updates whenever two
/// threads scatter to the same input pixel — a common case, since the
/// bilinear taps of neighbouring kernel positions overlap.
///
/// This routine performs the read-modify-write atomically:
///
/// 1. Mask the byte address down to the enclosing aligned 32-bit word and
///    derive the lane bit-offset (`addr & 2` → `{0, 16}`): the target
///    `f16` occupies either bits `0..16` (low) or `16..32` (high).
/// 2. Loop: load the current 32-bit word, extract the target lane with
///    `bfe.u32`, reinterpret it as `f16`, promote to `f32`, add the
///    contribution, demote back to `f16`, splice the new lane into the word
///    with `bfi.b32`, and `atom.global.cas.b32`. Retry while the CAS
///    observes a word other than the one we read.
///
/// The 16-bit reinterpretation between an `f16` value and the 32-bit lane
/// register goes through a `.b16` (untyped 16-bit) register: `cvt.u16.u32`
/// truncates the extracted field into it, `mov.b16` bit-copies it to/from
/// the `.f16` register, and `cvt.u32.u16` zero-extends it back. No numeric
/// float conversion happens during the bit shuffling.
///
/// `addr` is the byte address of the target `f16` (a `u64` register name);
/// `value` is the `f16` register name holding the increment.
fn emit_f16_atomic_add(b: &mut oxicuda_ptx::builder::BodyBuilder<'_>, addr: &str, value: &str) {
    b.comment("--- f16 atomic add (32-bit CAS loop) ---");

    // Aligned 32-bit word address = addr & ~0x3.
    let word_addr = b.alloc_reg(PtxType::U64);
    b.raw_ptx(&format!("and.b64 {word_addr}, {addr}, -4;"));

    // Byte offset of the half within the word: addr & 0x2 -> {0, 2}.
    let byte_off = b.alloc_reg(PtxType::U64);
    b.raw_ptx(&format!("and.b64 {byte_off}, {addr}, 2;"));

    // Bit shift for the target lane: 0 for the low half, 16 for the high
    // half. shift = (byte_off as u32) << 3  (0 -> 0, 2 -> 16).
    let shift = b.alloc_reg(PtxType::U32);
    let byte_off_u32 = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("cvt.u32.u64 {byte_off_u32}, {byte_off};"));
    b.raw_ptx(&format!("shl.b32 {shift}, {byte_off_u32}, 3;"));

    // Promote the increment to f32 once, outside the retry loop.
    let value_f32 = b.alloc_reg(PtxType::F32);
    b.raw_ptx(&format!("cvt.f32.f16 {value_f32}, {value};"));

    // Registers reused across loop iterations.
    let old_word = b.alloc_reg(PtxType::B32);
    let lane_bits = b.alloc_reg(PtxType::U32);
    let lane_b16 = b.alloc_reg(PtxType::B16);
    let lane_f16 = b.alloc_reg(PtxType::F16);
    let lane_f32 = b.alloc_reg(PtxType::F32);
    let sum_f32 = b.alloc_reg(PtxType::F32);
    let sum_f16 = b.alloc_reg(PtxType::F16);
    let sum_b16 = b.alloc_reg(PtxType::B16);
    let sum_bits = b.alloc_reg(PtxType::U32);
    let new_word = b.alloc_reg(PtxType::B32);
    let cas_ret = b.alloc_reg(PtxType::B32);
    let len16 = b.alloc_reg(PtxType::U32);
    let pred_done = b.alloc_reg(PtxType::Pred);

    // Field length for bfe/bfi is a constant 16 bits.
    b.raw_ptx(&format!("mov.u32 {len16}, 16;"));

    let cas_loop = b.fresh_label("f16_cas_loop");
    b.raw_ptx(&format!("{cas_loop}:"));

    // Load the current 32-bit word.
    b.raw_ptx(&format!("ld.global.b32 {old_word}, [{word_addr}];"));

    // Extract the 16-bit lane into the low bits of a u32 register, then
    // narrow it into a b16 register and reinterpret those bits as f16.
    b.raw_ptx(&format!(
        "bfe.u32 {lane_bits}, {old_word}, {shift}, {len16};"
    ));
    b.raw_ptx(&format!("cvt.u16.u32 {lane_b16}, {lane_bits};"));
    b.raw_ptx(&format!("mov.b16 {lane_f16}, {lane_b16};"));

    // Promote the lane to f32, add the contribution, demote back to f16.
    b.raw_ptx(&format!("cvt.f32.f16 {lane_f32}, {lane_f16};"));
    b.raw_ptx(&format!("add.rn.f32 {sum_f32}, {lane_f32}, {value_f32};"));
    b.raw_ptx(&format!("cvt.rn.f16.f32 {sum_f16}, {sum_f32};"));

    // Reinterpret the new f16 as 16 raw bits, widen to u32, splice it into
    // the target lane of the word with bfi.
    b.raw_ptx(&format!("mov.b16 {sum_b16}, {sum_f16};"));
    b.raw_ptx(&format!("cvt.u32.u16 {sum_bits}, {sum_b16};"));
    b.raw_ptx(&format!(
        "bfi.b32 {new_word}, {sum_bits}, {old_word}, {shift}, {len16};"
    ));

    // Atomic compare-and-swap; retry if another thread changed the word.
    b.raw_ptx(&format!(
        "atom.global.cas.b32 {cas_ret}, [{word_addr}], {old_word}, {new_word};"
    ));
    b.raw_ptx(&format!("setp.eq.b32 {pred_done}, {cas_ret}, {old_word};"));
    b.raw_ptx(&format!("@!{pred_done} bra {cas_loop};"));
}

// ---------------------------------------------------------------------------
// Forward body emitter
// ---------------------------------------------------------------------------

/// Emits the forward pass kernel body for deformable convolution.
///
/// Thread mapping: one thread per output element `(n, c_out, oh, ow)`.
/// For each kernel position `(kh, kw)`, the thread:
/// 1. Reads the 2D offset for this position from the offset tensor
/// 2. Computes the fractional sampling location
/// 3. Performs bilinear interpolation on the input
/// 4. Multiplies by the modulation mask (DCNv2)
/// 5. Multiplies by the weight and accumulates
fn emit_forward_body(b: &mut oxicuda_ptx::builder::BodyBuilder<'_>, p: &DeformableBodyParams) {
    let ft = p.float_type;
    let fs = ptx_float_suffix(ft);
    let lt = ptx_load_type(ft);
    let eb = p.elem_bytes;

    b.comment("=== Deformable Conv Forward (DCNv2) ===");
    b.comment("Each thread computes one output element (n, c_out, oh, ow).");

    // Bounds check
    let gid = b.global_thread_id_x();
    let total = b.load_param_u32("total_outputs");
    let pred_bounds = b.alloc_reg(PtxType::Pred);
    b.raw_ptx(&format!("setp.lo.u32 {pred_bounds}, {gid}, {total};"));
    let exit_label = b.fresh_label("dcn_fwd_exit");
    b.raw_ptx(&format!("@!{pred_bounds} bra {exit_label};"));

    // Load parameters
    let input_ptr = b.load_param_u64("input");
    let offset_ptr = b.load_param_u64("offset");
    let mask_ptr = b.load_param_u64("mask");
    let weight_ptr = b.load_param_u64("weight");
    let bias_ptr = b.load_param_u64("bias");
    let output_ptr = b.load_param_u64("output");
    let _batch_size = b.load_param_u32("batch_size");
    let in_channels = b.load_param_u32("in_channels");
    let in_h = b.load_param_u32("in_h");
    let in_w = b.load_param_u32("in_w");
    let out_channels = b.load_param_u32("out_channels");
    let out_h = b.load_param_u32("out_h");
    let out_w = b.load_param_u32("out_w");

    // Decompose gid -> (n, c_out, oh, ow)
    b.comment("Decompose gid -> (n, c_out, oh, ow)");
    let out_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {out_hw}, {out_h}, {out_w};"));

    let c_out_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {c_out_hw}, {out_channels}, {out_hw};"));

    let n_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {n_idx}, {gid}, {c_out_hw};"));
    let rem1 = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("rem.u32 {rem1}, {gid}, {c_out_hw};"));

    let c_out_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {c_out_idx}, {rem1}, {out_hw};"));
    let rem2 = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("rem.u32 {rem2}, {rem1}, {out_hw};"));

    let oh = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {oh}, {rem2}, {out_w};"));
    let ow = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("rem.u32 {ow}, {rem2}, {out_w};"));

    // Initialize accumulator to zero
    let acc = b.alloc_reg(ft);
    emit_zero(b, ft, &acc.to_string());

    // Pre-compute input spatial size
    let in_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {in_hw}, {in_h}, {in_w};"));

    // Scratch registers
    let h_sample_f = b.alloc_reg(ft);
    let w_sample_f = b.alloc_reg(ft);
    let h_base_f = b.alloc_reg(ft);
    let w_base_f = b.alloc_reg(ft);
    let offset_h_val = b.alloc_reg(ft);
    let offset_w_val = b.alloc_reg(ft);
    let h_floor_f = b.alloc_reg(ft);
    let w_floor_f = b.alloc_reg(ft);
    let h_ceil_f = b.alloc_reg(ft);
    let w_ceil_f = b.alloc_reg(ft);
    let h_frac = b.alloc_reg(ft);
    let w_frac = b.alloc_reg(ft);
    let one_minus_hf = b.alloc_reg(ft);
    let one_minus_wf = b.alloc_reg(ft);
    let one_const = b.alloc_reg(ft);
    emit_one(b, ft, &one_const.to_string());

    let h_floor_i = b.alloc_reg(PtxType::S32);
    let w_floor_i = b.alloc_reg(PtxType::S32);
    let h_ceil_i = b.alloc_reg(PtxType::S32);
    let w_ceil_i = b.alloc_reg(PtxType::S32);

    let pred_h0 = b.alloc_reg(PtxType::Pred);
    let pred_h1 = b.alloc_reg(PtxType::Pred);
    let pred_w0 = b.alloc_reg(PtxType::Pred);
    let pred_w1 = b.alloc_reg(PtxType::Pred);
    let pred_valid = b.alloc_reg(PtxType::Pred);

    let w_tl = b.alloc_reg(ft);
    let w_tr = b.alloc_reg(ft);
    let w_bl = b.alloc_reg(ft);
    let w_br = b.alloc_reg(ft);
    let v_tl = b.alloc_reg(ft);
    let v_tr = b.alloc_reg(ft);
    let v_bl = b.alloc_reg(ft);
    let v_br = b.alloc_reg(ft);
    let interp_val = b.alloc_reg(ft);
    let weight_val = b.alloc_reg(ft);
    let mask_val = b.alloc_reg(ft);
    let contrib = b.alloc_reg(ft);
    let tmp_f = b.alloc_reg(ft);
    let _tmp_f2 = b.alloc_reg(ft);

    let addr64 = b.alloc_reg(PtxType::U64);
    let off64 = b.alloc_reg(PtxType::U64);
    let idx32 = b.alloc_reg(PtxType::U32);
    let idx64 = b.alloc_reg(PtxType::U64);
    let tmp32 = b.alloc_reg(PtxType::U32);
    let _tmp32b = b.alloc_reg(PtxType::U32);

    let in_h_s32 = b.alloc_reg(PtxType::S32);
    let in_w_s32 = b.alloc_reg(PtxType::S32);
    b.raw_ptx(&format!("mov.u32 {in_h_s32}, {in_h};"));
    b.raw_ptx(&format!("mov.u32 {in_w_s32}, {in_w};"));

    let zero_s32 = b.alloc_reg(PtxType::S32);
    b.raw_ptx(&format!("mov.s32 {zero_s32}, 0;"));

    let kh_kw = p.kernel_h * p.kernel_w;
    let channels_per_og = p.channels_per_offset_group;
    let offset_groups = p.offset_groups;

    b.comment("Loop over input channels and kernel positions");

    // Convert oh, ow to float for computing sample positions
    let oh_f = b.alloc_reg(ft);
    let ow_f = b.alloc_reg(ft);
    if ft == PtxType::F32 {
        b.raw_ptx(&format!("cvt.rn.f32.u32 {oh_f}, {oh};"));
        b.raw_ptx(&format!("cvt.rn.f32.u32 {ow_f}, {ow};"));
    } else {
        // F16: convert via f32 intermediate
        let tmp_oh = b.alloc_reg(PtxType::F32);
        let tmp_ow = b.alloc_reg(PtxType::F32);
        b.raw_ptx(&format!("cvt.rn.f32.u32 {tmp_oh}, {oh};"));
        b.raw_ptx(&format!("cvt.rn.f32.u32 {tmp_ow}, {ow};"));
        b.raw_ptx(&format!("cvt.rn.f16.f32 {oh_f}, {tmp_oh};"));
        b.raw_ptx(&format!("cvt.rn.f16.f32 {ow_f}, {tmp_ow};"));
    }

    // For each input channel, for each kernel position, unrolled over (kh, kw)
    // Since in_channels is dynamic, we use a register loop over c_in
    // but unroll the kernel positions.
    let c_in = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mov.u32 {c_in}, 0;"));
    let cin_loop = b.fresh_label("cin_loop");
    let cin_done = b.fresh_label("cin_done");
    b.raw_ptx(&format!("{cin_loop}:"));
    let pred_cin = b.alloc_reg(PtxType::Pred);
    b.raw_ptx(&format!("setp.lo.u32 {pred_cin}, {c_in}, {in_channels};"));
    b.raw_ptx(&format!("@!{pred_cin} bra {cin_done};"));

    // Determine offset group for this c_in
    let og_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {og_idx}, {c_in}, {channels_per_og};"));

    // Unrolled kernel position loop
    for kh_val in 0..p.kernel_h {
        for kw_val in 0..p.kernel_w {
            let kpos = kh_val * p.kernel_w + kw_val;
            let skip_label = b.fresh_label(&format!("fwd_skip_k{kh_val}_{kw_val}"));

            b.comment(&format!("Kernel position kh={kh_val}, kw={kw_val}"));

            // Compute base sample position (before offset)
            // h_base = oh * stride_h - pad_h + kh * dilation_h
            let h_base_val = kh_val * p.dilation_h;
            let w_base_val = kw_val * p.dilation_w;

            if ft == PtxType::F32 {
                let stride_h_bits = (p.stride_h as f32).to_bits();
                let stride_w_bits = (p.stride_w as f32).to_bits();
                let pad_h_bits = (p.pad_h as f32).to_bits();
                let pad_w_bits = (p.pad_w as f32).to_bits();
                let h_base_bits = (h_base_val as f32).to_bits();
                let w_base_bits = (w_base_val as f32).to_bits();

                // h_base_f = oh_f * stride_h - pad_h + kh*dilation_h
                let stride_h_reg = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {stride_h_reg}, 0F{stride_h_bits:08X};"));
                b.raw_ptx(&format!("mul.rn.f32 {h_base_f}, {oh_f}, {stride_h_reg};"));
                let pad_h_reg = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {pad_h_reg}, 0F{pad_h_bits:08X};"));
                b.raw_ptx(&format!("sub.rn.f32 {h_base_f}, {h_base_f}, {pad_h_reg};"));
                let h_off_reg = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {h_off_reg}, 0F{h_base_bits:08X};"));
                b.raw_ptx(&format!("add.rn.f32 {h_base_f}, {h_base_f}, {h_off_reg};"));

                let stride_w_reg = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {stride_w_reg}, 0F{stride_w_bits:08X};"));
                b.raw_ptx(&format!("mul.rn.f32 {w_base_f}, {ow_f}, {stride_w_reg};"));
                let pad_w_reg = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {pad_w_reg}, 0F{pad_w_bits:08X};"));
                b.raw_ptx(&format!("sub.rn.f32 {w_base_f}, {w_base_f}, {pad_w_reg};"));
                let w_off_reg = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {w_off_reg}, 0F{w_base_bits:08X};"));
                b.raw_ptx(&format!("add.rn.f32 {w_base_f}, {w_base_f}, {w_off_reg};"));
            } else {
                // F16 path: compute in f32 then convert
                let tmp_f32a = b.alloc_reg(PtxType::F32);
                let tmp_f32b = b.alloc_reg(PtxType::F32);
                let oh_f32 = b.alloc_reg(PtxType::F32);
                let ow_f32 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("cvt.f32.f16 {oh_f32}, {oh_f};"));
                b.raw_ptx(&format!("cvt.f32.f16 {ow_f32}, {ow_f};"));
                let stride_h_bits = (p.stride_h as f32).to_bits();
                let stride_w_bits = (p.stride_w as f32).to_bits();
                let pad_h_bits = (p.pad_h as f32).to_bits();
                let pad_w_bits = (p.pad_w as f32).to_bits();
                let h_base_bits = (h_base_val as f32).to_bits();
                let w_base_bits = (w_base_val as f32).to_bits();
                let sr = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {sr}, 0F{stride_h_bits:08X};"));
                b.raw_ptx(&format!("mul.rn.f32 {tmp_f32a}, {oh_f32}, {sr};"));
                let pr = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {pr}, 0F{pad_h_bits:08X};"));
                b.raw_ptx(&format!("sub.rn.f32 {tmp_f32a}, {tmp_f32a}, {pr};"));
                let hr = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {hr}, 0F{h_base_bits:08X};"));
                b.raw_ptx(&format!("add.rn.f32 {tmp_f32a}, {tmp_f32a}, {hr};"));
                b.raw_ptx(&format!("cvt.rn.f16.f32 {h_base_f}, {tmp_f32a};"));

                let sw = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {sw}, 0F{stride_w_bits:08X};"));
                b.raw_ptx(&format!("mul.rn.f32 {tmp_f32b}, {ow_f32}, {sw};"));
                let pw = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {pw}, 0F{pad_w_bits:08X};"));
                b.raw_ptx(&format!("sub.rn.f32 {tmp_f32b}, {tmp_f32b}, {pw};"));
                let wr = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {wr}, 0F{w_base_bits:08X};"));
                b.raw_ptx(&format!("add.rn.f32 {tmp_f32b}, {tmp_f32b}, {wr};"));
                b.raw_ptx(&format!("cvt.rn.f16.f32 {w_base_f}, {tmp_f32b};"));
            }

            // Load offset for this kernel position from offset tensor
            // offset layout: [N, offset_groups * kh_kw * 2, out_h, out_w]
            // offset index: n * (offset_groups * kh_kw * 2 * out_hw)
            //              + og_idx * (kh_kw * 2 * out_hw)
            //              + kpos * 2 * out_hw
            //              + {0,1} * out_hw
            //              + oh * out_w + ow
            let offset_chan_stride = 2 * kh_kw * offset_groups;
            b.comment("Load offset_h and offset_w");
            // spatial_idx = oh * out_w + ow
            let spatial_idx = b.alloc_reg(PtxType::U32);
            b.raw_ptx(&format!("mul.lo.u32 {spatial_idx}, {oh}, {out_w};"));
            b.raw_ptx(&format!("add.u32 {spatial_idx}, {spatial_idx}, {ow};"));

            // offset_base = n * offset_chan_stride * out_hw
            //             + og_idx * kh_kw * 2 * out_hw
            //             + kpos * 2 * out_hw
            let og_kpos_2 = b.alloc_reg(PtxType::U32);
            b.raw_ptx(&format!("mul.lo.u32 {og_kpos_2}, {og_idx}, {kh_kw};"));
            b.raw_ptx(&format!("add.u32 {og_kpos_2}, {og_kpos_2}, {kpos};"));
            b.raw_ptx(&format!("mul.lo.u32 {og_kpos_2}, {og_kpos_2}, 2;"));

            // full channel index * out_hw + spatial_idx
            let off_base = b.alloc_reg(PtxType::U32);
            let n_offset_stride = b.alloc_reg(PtxType::U32);
            b.raw_ptx(&format!(
                "mul.lo.u32 {n_offset_stride}, {n_idx}, {offset_chan_stride};"
            ));
            b.raw_ptx(&format!(
                "add.u32 {off_base}, {n_offset_stride}, {og_kpos_2};"
            ));
            b.raw_ptx(&format!("mul.lo.u32 {off_base}, {off_base}, {out_hw};"));
            b.raw_ptx(&format!("add.u32 {off_base}, {off_base}, {spatial_idx};"));

            // Load offset_h at off_base
            b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {off_base};"));
            b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
            b.raw_ptx(&format!("add.u64 {addr64}, {offset_ptr}, {off64};"));
            b.raw_ptx(&format!("ld.global.{lt} {offset_h_val}, [{addr64}];"));

            // Load offset_w at off_base + out_hw
            let off_w_idx = b.alloc_reg(PtxType::U32);
            b.raw_ptx(&format!("add.u32 {off_w_idx}, {off_base}, {out_hw};"));
            b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {off_w_idx};"));
            b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
            b.raw_ptx(&format!("add.u64 {addr64}, {offset_ptr}, {off64};"));
            b.raw_ptx(&format!("ld.global.{lt} {offset_w_val}, [{addr64}];"));

            // h_sample = h_base + offset_h, w_sample = w_base + offset_w
            b.raw_ptx(&format!(
                "add.rn.{fs} {h_sample_f}, {h_base_f}, {offset_h_val};"
            ));
            b.raw_ptx(&format!(
                "add.rn.{fs} {w_sample_f}, {w_base_f}, {offset_w_val};"
            ));

            // Bilinear interpolation:
            // floor/ceil of sample coordinates
            if ft == PtxType::F32 {
                b.raw_ptx(&format!("cvt.rmi.f32.f32 {h_floor_f}, {h_sample_f};"));
                b.raw_ptx(&format!("cvt.rmi.f32.f32 {w_floor_f}, {w_sample_f};"));
            } else {
                // F16 floor via f32 round-trip
                let tmp_h32 = b.alloc_reg(PtxType::F32);
                let tmp_w32 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("cvt.f32.f16 {tmp_h32}, {h_sample_f};"));
                b.raw_ptx(&format!("cvt.rmi.f32.f32 {tmp_h32}, {tmp_h32};"));
                b.raw_ptx(&format!("cvt.rn.f16.f32 {h_floor_f}, {tmp_h32};"));
                b.raw_ptx(&format!("cvt.f32.f16 {tmp_w32}, {w_sample_f};"));
                b.raw_ptx(&format!("cvt.rmi.f32.f32 {tmp_w32}, {tmp_w32};"));
                b.raw_ptx(&format!("cvt.rn.f16.f32 {w_floor_f}, {tmp_w32};"));
            }

            // ceil = floor + 1
            b.raw_ptx(&format!(
                "add.rn.{fs} {h_ceil_f}, {h_floor_f}, {one_const};"
            ));
            b.raw_ptx(&format!(
                "add.rn.{fs} {w_ceil_f}, {w_floor_f}, {one_const};"
            ));

            // fractional parts
            b.raw_ptx(&format!("sub.rn.{fs} {h_frac}, {h_sample_f}, {h_floor_f};"));
            b.raw_ptx(&format!("sub.rn.{fs} {w_frac}, {w_sample_f}, {w_floor_f};"));
            b.raw_ptx(&format!(
                "sub.rn.{fs} {one_minus_hf}, {one_const}, {h_frac};"
            ));
            b.raw_ptx(&format!(
                "sub.rn.{fs} {one_minus_wf}, {one_const}, {w_frac};"
            ));

            // Bilinear weights: w_tl = (1-hf)*(1-wf), etc.
            b.raw_ptx(&format!(
                "mul.rn.{fs} {w_tl}, {one_minus_hf}, {one_minus_wf};"
            ));
            b.raw_ptx(&format!("mul.rn.{fs} {w_tr}, {one_minus_hf}, {w_frac};"));
            b.raw_ptx(&format!("mul.rn.{fs} {w_bl}, {h_frac}, {one_minus_wf};"));
            b.raw_ptx(&format!("mul.rn.{fs} {w_br}, {h_frac}, {w_frac};"));

            // Convert floor/ceil to integer for bounds checking
            if ft == PtxType::F32 {
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {h_floor_i}, {h_floor_f};"));
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {w_floor_i}, {w_floor_f};"));
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {h_ceil_i}, {h_ceil_f};"));
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {w_ceil_i}, {w_ceil_f};"));
            } else {
                let tmp_h32c = b.alloc_reg(PtxType::F32);
                let tmp_w32c = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("cvt.f32.f16 {tmp_h32c}, {h_floor_f};"));
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {h_floor_i}, {tmp_h32c};"));
                b.raw_ptx(&format!("cvt.f32.f16 {tmp_w32c}, {w_floor_f};"));
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {w_floor_i}, {tmp_w32c};"));
                b.raw_ptx(&format!("add.s32 {h_ceil_i}, {h_floor_i}, 1;"));
                b.raw_ptx(&format!("add.s32 {w_ceil_i}, {w_floor_i}, 1;"));
            }

            // Initialize interpolated value to zero
            emit_zero(b, ft, &interp_val.to_string());

            // For each of the 4 bilinear samples (TL, TR, BL, BR):
            // Check bounds, load pixel, multiply by bilinear weight, accumulate
            for (corner_name, h_reg, w_reg, bw_reg) in [
                ("tl", &h_floor_i, &w_floor_i, &w_tl),
                ("tr", &h_floor_i, &w_ceil_i, &w_tr),
                ("bl", &h_ceil_i, &w_floor_i, &w_bl),
                ("br", &h_ceil_i, &w_ceil_i, &w_br),
            ] {
                let corner_skip = b.fresh_label(&format!("skip_{corner_name}_k{kh_val}_{kw_val}"));
                let corner_end = b.fresh_label(&format!("end_{corner_name}_k{kh_val}_{kw_val}"));

                // bounds: 0 <= h < in_h && 0 <= w < in_w
                b.raw_ptx(&format!("setp.ge.s32 {pred_h0}, {h_reg}, {zero_s32};"));
                b.raw_ptx(&format!("setp.lt.s32 {pred_h1}, {h_reg}, {in_h_s32};"));
                b.raw_ptx(&format!("setp.ge.s32 {pred_w0}, {w_reg}, {zero_s32};"));
                b.raw_ptx(&format!("setp.lt.s32 {pred_w1}, {w_reg}, {in_w_s32};"));
                b.raw_ptx(&format!("and.pred {pred_valid}, {pred_h0}, {pred_h1};"));
                b.raw_ptx(&format!("and.pred {pred_valid}, {pred_valid}, {pred_w0};"));
                b.raw_ptx(&format!("and.pred {pred_valid}, {pred_valid}, {pred_w1};"));
                b.raw_ptx(&format!("@!{pred_valid} bra {corner_skip};"));

                // input index: n * in_channels * in_hw + c_in * in_hw + h * in_w + w
                let pixel_idx = b.alloc_reg(PtxType::U32);
                let h_u32 = b.alloc_reg(PtxType::U32);
                let w_u32 = b.alloc_reg(PtxType::U32);
                b.raw_ptx(&format!("mov.u32 {h_u32}, {h_reg};"));
                b.raw_ptx(&format!("mov.u32 {w_u32}, {w_reg};"));
                b.raw_ptx(&format!("mul.lo.u32 {pixel_idx}, {n_idx}, {in_channels};"));
                b.raw_ptx(&format!("add.u32 {pixel_idx}, {pixel_idx}, {c_in};"));
                b.raw_ptx(&format!("mul.lo.u32 {pixel_idx}, {pixel_idx}, {in_hw};"));
                b.raw_ptx(&format!("mul.lo.u32 {tmp32}, {h_u32}, {in_w};"));
                b.raw_ptx(&format!("add.u32 {pixel_idx}, {pixel_idx}, {tmp32};"));
                b.raw_ptx(&format!("add.u32 {pixel_idx}, {pixel_idx}, {w_u32};"));

                // Load input pixel
                b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {pixel_idx};"));
                b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
                b.raw_ptx(&format!("add.u64 {addr64}, {input_ptr}, {off64};"));
                let corner_val = match corner_name {
                    "tl" => &v_tl,
                    "tr" => &v_tr,
                    "bl" => &v_bl,
                    _ => &v_br,
                };
                b.raw_ptx(&format!("ld.global.{lt} {corner_val}, [{addr64}];"));

                // interp_val += bilinear_weight * pixel_value
                b.raw_ptx(&format!("mul.rn.{fs} {tmp_f}, {bw_reg}, {corner_val};"));
                b.raw_ptx(&format!("add.rn.{fs} {interp_val}, {interp_val}, {tmp_f};"));
                b.raw_ptx(&format!("bra {corner_end};"));

                b.raw_ptx(&format!("{corner_skip}:"));
                // Out of bounds: contribute nothing (zero * weight = 0)
                b.raw_ptx(&format!("{corner_end}:"));
            }

            // Load weight: w(c_out, c_in, kh, kw)
            // weight index: c_out * in_channels * kh_kw + c_in * kh_kw + kpos
            b.raw_ptx(&format!("mul.lo.u32 {idx32}, {c_out_idx}, {in_channels};"));
            b.raw_ptx(&format!("add.u32 {idx32}, {idx32}, {c_in};"));
            b.raw_ptx(&format!("mul.lo.u32 {idx32}, {idx32}, {kh_kw};"));
            b.raw_ptx(&format!("add.u32 {idx32}, {idx32}, {kpos};"));
            b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {idx32};"));
            b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
            b.raw_ptx(&format!("add.u64 {addr64}, {weight_ptr}, {off64};"));
            b.raw_ptx(&format!("ld.global.{lt} {weight_val}, [{addr64}];"));

            // contrib = interp_val * weight_val
            b.raw_ptx(&format!(
                "mul.rn.{fs} {contrib}, {interp_val}, {weight_val};"
            ));

            // Apply modulation mask if DCNv2
            if p.use_modulation {
                b.comment("Apply modulation mask (DCNv2)");
                // mask layout: [N, offset_groups * kh_kw, out_h, out_w]
                let mask_base = b.alloc_reg(PtxType::U32);
                let mask_chan_stride = kh_kw * offset_groups;
                b.raw_ptx(&format!(
                    "mul.lo.u32 {mask_base}, {n_idx}, {mask_chan_stride};"
                ));
                let og_kpos = b.alloc_reg(PtxType::U32);
                b.raw_ptx(&format!("mul.lo.u32 {og_kpos}, {og_idx}, {kh_kw};"));
                b.raw_ptx(&format!("add.u32 {og_kpos}, {og_kpos}, {kpos};"));
                b.raw_ptx(&format!("add.u32 {mask_base}, {mask_base}, {og_kpos};"));
                b.raw_ptx(&format!("mul.lo.u32 {mask_base}, {mask_base}, {out_hw};"));
                b.raw_ptx(&format!("add.u32 {mask_base}, {mask_base}, {spatial_idx};"));

                b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {mask_base};"));
                b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
                b.raw_ptx(&format!("add.u64 {addr64}, {mask_ptr}, {off64};"));
                b.raw_ptx(&format!("ld.global.{lt} {mask_val}, [{addr64}];"));

                b.raw_ptx(&format!("mul.rn.{fs} {contrib}, {contrib}, {mask_val};"));
            }

            // Accumulate
            b.raw_ptx(&format!("add.rn.{fs} {acc}, {acc}, {contrib};"));

            b.raw_ptx(&format!("{skip_label}:"));
        }
    }

    // Increment c_in and loop
    b.raw_ptx(&format!("add.u32 {c_in}, {c_in}, 1;"));
    b.raw_ptx(&format!("bra {cin_loop};"));
    b.raw_ptx(&format!("{cin_done}:"));

    // Add bias: acc += bias[c_out]
    b.comment("Add bias");
    let bias_val = b.alloc_reg(ft);
    b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {c_out_idx};"));
    b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
    b.raw_ptx(&format!("add.u64 {addr64}, {bias_ptr}, {off64};"));
    b.raw_ptx(&format!("ld.global.{lt} {bias_val}, [{addr64}];"));
    b.raw_ptx(&format!("add.rn.{fs} {acc}, {acc}, {bias_val};"));

    // Store output
    b.comment("Store output");
    b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {gid};"));
    b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
    b.raw_ptx(&format!("add.u64 {addr64}, {output_ptr}, {off64};"));
    b.raw_ptx(&format!("st.global.{lt} [{addr64}], {acc};"));

    b.raw_ptx(&format!("{exit_label}:"));
    b.raw_ptx("ret;");
}

// ---------------------------------------------------------------------------
// Backward input body emitter
// ---------------------------------------------------------------------------

/// Emits the backward-input kernel body.
///
/// Thread mapping: one thread per input element `(n, c_in, ih, iw)`.
/// For each output position that sampled near this input position,
/// the gradient is scattered back using atomic adds weighted by the
/// bilinear interpolation coefficients.
fn emit_backward_input_body(
    b: &mut oxicuda_ptx::builder::BodyBuilder<'_>,
    p: &DeformableBodyParams,
) {
    let ft = p.float_type;
    let fs = ptx_float_suffix(ft);
    let lt = ptx_load_type(ft);
    let eb = p.elem_bytes;

    b.comment("=== Deformable Conv Backward Input ===");
    b.comment("Each thread processes one output element and atomically");
    b.comment("scatters gradient to the 4 bilinear-interpolated input positions.");

    let gid = b.global_thread_id_x();
    let total = b.load_param_u32("total_outputs");
    let pred_bounds = b.alloc_reg(PtxType::Pred);
    b.raw_ptx(&format!("setp.lo.u32 {pred_bounds}, {gid}, {total};"));
    let exit_label = b.fresh_label("dcn_bwd_input_exit");
    b.raw_ptx(&format!("@!{pred_bounds} bra {exit_label};"));

    let grad_out_ptr = b.load_param_u64("grad_output");
    let offset_ptr = b.load_param_u64("offset");
    let mask_ptr = b.load_param_u64("mask");
    let weight_ptr = b.load_param_u64("weight");
    let grad_input_ptr = b.load_param_u64("grad_input");
    let _batch_size = b.load_param_u32("batch_size");
    let in_channels = b.load_param_u32("in_channels");
    let in_h = b.load_param_u32("in_h");
    let in_w = b.load_param_u32("in_w");
    let out_channels = b.load_param_u32("out_channels");
    let out_h = b.load_param_u32("out_h");
    let out_w = b.load_param_u32("out_w");

    // Decompose gid -> (n, c_out, oh, ow) — iterate over output elements
    let out_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {out_hw}, {out_h}, {out_w};"));
    let c_out_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {c_out_hw}, {out_channels}, {out_hw};"));

    let n_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {n_idx}, {gid}, {c_out_hw};"));
    let rem1 = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("rem.u32 {rem1}, {gid}, {c_out_hw};"));
    let c_out_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {c_out_idx}, {rem1}, {out_hw};"));
    let rem2 = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("rem.u32 {rem2}, {rem1}, {out_hw};"));
    let oh = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {oh}, {rem2}, {out_w};"));
    let ow = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("rem.u32 {ow}, {rem2}, {out_w};"));

    // Load grad_output value at this position
    let grad_out_val = b.alloc_reg(ft);
    let idx64 = b.alloc_reg(PtxType::U64);
    let off64 = b.alloc_reg(PtxType::U64);
    let addr64 = b.alloc_reg(PtxType::U64);
    b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {gid};"));
    b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
    b.raw_ptx(&format!("add.u64 {addr64}, {grad_out_ptr}, {off64};"));
    b.raw_ptx(&format!("ld.global.{lt} {grad_out_val}, [{addr64}];"));

    let in_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {in_hw}, {in_h}, {in_w};"));

    let oh_f = b.alloc_reg(ft);
    let ow_f = b.alloc_reg(ft);
    if ft == PtxType::F32 {
        b.raw_ptx(&format!("cvt.rn.f32.u32 {oh_f}, {oh};"));
        b.raw_ptx(&format!("cvt.rn.f32.u32 {ow_f}, {ow};"));
    } else {
        let t1 = b.alloc_reg(PtxType::F32);
        let t2 = b.alloc_reg(PtxType::F32);
        b.raw_ptx(&format!("cvt.rn.f32.u32 {t1}, {oh};"));
        b.raw_ptx(&format!("cvt.rn.f32.u32 {t2}, {ow};"));
        b.raw_ptx(&format!("cvt.rn.f16.f32 {oh_f}, {t1};"));
        b.raw_ptx(&format!("cvt.rn.f16.f32 {ow_f}, {t2};"));
    }

    let one_const = b.alloc_reg(ft);
    emit_one(b, ft, &one_const.to_string());

    let in_h_s32 = b.alloc_reg(PtxType::S32);
    let in_w_s32 = b.alloc_reg(PtxType::S32);
    b.raw_ptx(&format!("mov.u32 {in_h_s32}, {in_h};"));
    b.raw_ptx(&format!("mov.u32 {in_w_s32}, {in_w};"));
    let zero_s32 = b.alloc_reg(PtxType::S32);
    b.raw_ptx(&format!("mov.s32 {zero_s32}, 0;"));

    let kh_kw = p.kernel_h * p.kernel_w;
    let channels_per_og = p.channels_per_offset_group;
    let offset_groups = p.offset_groups;

    // Scratch registers
    let h_base_f = b.alloc_reg(ft);
    let w_base_f = b.alloc_reg(ft);
    let h_sample_f = b.alloc_reg(ft);
    let w_sample_f = b.alloc_reg(ft);
    let offset_h_val = b.alloc_reg(ft);
    let offset_w_val = b.alloc_reg(ft);
    let h_floor_f = b.alloc_reg(ft);
    let w_floor_f = b.alloc_reg(ft);
    let h_frac = b.alloc_reg(ft);
    let w_frac = b.alloc_reg(ft);
    let one_minus_hf = b.alloc_reg(ft);
    let one_minus_wf = b.alloc_reg(ft);
    let h_floor_i = b.alloc_reg(PtxType::S32);
    let w_floor_i = b.alloc_reg(PtxType::S32);
    let h_ceil_i = b.alloc_reg(PtxType::S32);
    let w_ceil_i = b.alloc_reg(PtxType::S32);
    let weight_val = b.alloc_reg(ft);
    let mask_val = b.alloc_reg(ft);
    let grad_scaled = b.alloc_reg(ft);
    let tmp_f = b.alloc_reg(ft);
    let idx32 = b.alloc_reg(PtxType::U32);
    let tmp32 = b.alloc_reg(PtxType::U32);
    let spatial_idx = b.alloc_reg(PtxType::U32);
    let pred_h0 = b.alloc_reg(PtxType::Pred);
    let pred_h1 = b.alloc_reg(PtxType::Pred);
    let pred_w0 = b.alloc_reg(PtxType::Pred);
    let pred_w1 = b.alloc_reg(PtxType::Pred);
    let pred_valid = b.alloc_reg(PtxType::Pred);

    // Loop over c_in, unrolled over (kh, kw)
    let c_in = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mov.u32 {c_in}, 0;"));
    let cin_loop = b.fresh_label("bwd_cin_loop");
    let cin_done = b.fresh_label("bwd_cin_done");
    b.raw_ptx(&format!("{cin_loop}:"));
    let pred_cin = b.alloc_reg(PtxType::Pred);
    b.raw_ptx(&format!("setp.lo.u32 {pred_cin}, {c_in}, {in_channels};"));
    b.raw_ptx(&format!("@!{pred_cin} bra {cin_done};"));

    let og_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {og_idx}, {c_in}, {channels_per_og};"));

    for kh_val in 0..p.kernel_h {
        for kw_val in 0..p.kernel_w {
            let kpos = kh_val * p.kernel_w + kw_val;
            let skip_label = b.fresh_label(&format!("bwd_skip_k{kh_val}_{kw_val}"));

            // Compute sample position (same as forward)
            let h_base_val = kh_val * p.dilation_h;
            let w_base_val = kw_val * p.dilation_w;

            if ft == PtxType::F32 {
                let stride_h_bits = (p.stride_h as f32).to_bits();
                let pad_h_bits = (p.pad_h as f32).to_bits();
                let h_off_bits = (h_base_val as f32).to_bits();
                let stride_w_bits = (p.stride_w as f32).to_bits();
                let pad_w_bits = (p.pad_w as f32).to_bits();
                let w_off_bits = (w_base_val as f32).to_bits();

                let sr = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {sr}, 0F{stride_h_bits:08X};"));
                b.raw_ptx(&format!("mul.rn.f32 {h_base_f}, {oh_f}, {sr};"));
                let pr = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {pr}, 0F{pad_h_bits:08X};"));
                b.raw_ptx(&format!("sub.rn.f32 {h_base_f}, {h_base_f}, {pr};"));
                let hr = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {hr}, 0F{h_off_bits:08X};"));
                b.raw_ptx(&format!("add.rn.f32 {h_base_f}, {h_base_f}, {hr};"));

                let sw = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {sw}, 0F{stride_w_bits:08X};"));
                b.raw_ptx(&format!("mul.rn.f32 {w_base_f}, {ow_f}, {sw};"));
                let pw = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {pw}, 0F{pad_w_bits:08X};"));
                b.raw_ptx(&format!("sub.rn.f32 {w_base_f}, {w_base_f}, {pw};"));
                let wr = b.alloc_reg(ft);
                b.raw_ptx(&format!("mov.b32 {wr}, 0F{w_off_bits:08X};"));
                b.raw_ptx(&format!("add.rn.f32 {w_base_f}, {w_base_f}, {wr};"));
            } else {
                // F16 path simplified: compute in f32
                let t1 = b.alloc_reg(PtxType::F32);
                let t2 = b.alloc_reg(PtxType::F32);
                let oh_f32 = b.alloc_reg(PtxType::F32);
                let ow_f32 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("cvt.f32.f16 {oh_f32}, {oh_f};"));
                b.raw_ptx(&format!("cvt.f32.f16 {ow_f32}, {ow_f};"));
                let sh = (p.stride_h as f32).to_bits();
                let ph = (p.pad_h as f32).to_bits();
                let hb = (h_base_val as f32).to_bits();
                let sw_b = (p.stride_w as f32).to_bits();
                let pw = (p.pad_w as f32).to_bits();
                let wb = (w_base_val as f32).to_bits();
                let r1 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {r1}, 0F{sh:08X};"));
                b.raw_ptx(&format!("mul.rn.f32 {t1}, {oh_f32}, {r1};"));
                let r2 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {r2}, 0F{ph:08X};"));
                b.raw_ptx(&format!("sub.rn.f32 {t1}, {t1}, {r2};"));
                let r3 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {r3}, 0F{hb:08X};"));
                b.raw_ptx(&format!("add.rn.f32 {t1}, {t1}, {r3};"));
                b.raw_ptx(&format!("cvt.rn.f16.f32 {h_base_f}, {t1};"));
                let r4 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {r4}, 0F{sw_b:08X};"));
                b.raw_ptx(&format!("mul.rn.f32 {t2}, {ow_f32}, {r4};"));
                let r5 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {r5}, 0F{pw:08X};"));
                b.raw_ptx(&format!("sub.rn.f32 {t2}, {t2}, {r5};"));
                let r6 = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("mov.b32 {r6}, 0F{wb:08X};"));
                b.raw_ptx(&format!("add.rn.f32 {t2}, {t2}, {r6};"));
                b.raw_ptx(&format!("cvt.rn.f16.f32 {w_base_f}, {t2};"));
            }

            // Load offsets
            b.raw_ptx(&format!("mul.lo.u32 {spatial_idx}, {oh}, {out_w};"));
            b.raw_ptx(&format!("add.u32 {spatial_idx}, {spatial_idx}, {ow};"));

            let offset_chan_stride = 2 * kh_kw * offset_groups;
            let og_kpos_2 = b.alloc_reg(PtxType::U32);
            b.raw_ptx(&format!("mul.lo.u32 {og_kpos_2}, {og_idx}, {kh_kw};"));
            b.raw_ptx(&format!("add.u32 {og_kpos_2}, {og_kpos_2}, {kpos};"));
            b.raw_ptx(&format!("mul.lo.u32 {og_kpos_2}, {og_kpos_2}, 2;"));

            let off_base = b.alloc_reg(PtxType::U32);
            let n_off_stride = b.alloc_reg(PtxType::U32);
            b.raw_ptx(&format!(
                "mul.lo.u32 {n_off_stride}, {n_idx}, {offset_chan_stride};"
            ));
            b.raw_ptx(&format!("add.u32 {off_base}, {n_off_stride}, {og_kpos_2};"));
            b.raw_ptx(&format!("mul.lo.u32 {off_base}, {off_base}, {out_hw};"));
            b.raw_ptx(&format!("add.u32 {off_base}, {off_base}, {spatial_idx};"));

            b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {off_base};"));
            b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
            b.raw_ptx(&format!("add.u64 {addr64}, {offset_ptr}, {off64};"));
            b.raw_ptx(&format!("ld.global.{lt} {offset_h_val}, [{addr64}];"));

            let off_w_idx = b.alloc_reg(PtxType::U32);
            b.raw_ptx(&format!("add.u32 {off_w_idx}, {off_base}, {out_hw};"));
            b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {off_w_idx};"));
            b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
            b.raw_ptx(&format!("add.u64 {addr64}, {offset_ptr}, {off64};"));
            b.raw_ptx(&format!("ld.global.{lt} {offset_w_val}, [{addr64}];"));

            b.raw_ptx(&format!(
                "add.rn.{fs} {h_sample_f}, {h_base_f}, {offset_h_val};"
            ));
            b.raw_ptx(&format!(
                "add.rn.{fs} {w_sample_f}, {w_base_f}, {offset_w_val};"
            ));

            // Bilinear decomposition
            if ft == PtxType::F32 {
                b.raw_ptx(&format!("cvt.rmi.f32.f32 {h_floor_f}, {h_sample_f};"));
                b.raw_ptx(&format!("cvt.rmi.f32.f32 {w_floor_f}, {w_sample_f};"));
            } else {
                let th = b.alloc_reg(PtxType::F32);
                let tw = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("cvt.f32.f16 {th}, {h_sample_f};"));
                b.raw_ptx(&format!("cvt.rmi.f32.f32 {th}, {th};"));
                b.raw_ptx(&format!("cvt.rn.f16.f32 {h_floor_f}, {th};"));
                b.raw_ptx(&format!("cvt.f32.f16 {tw}, {w_sample_f};"));
                b.raw_ptx(&format!("cvt.rmi.f32.f32 {tw}, {tw};"));
                b.raw_ptx(&format!("cvt.rn.f16.f32 {w_floor_f}, {tw};"));
            }

            b.raw_ptx(&format!("sub.rn.{fs} {h_frac}, {h_sample_f}, {h_floor_f};"));
            b.raw_ptx(&format!("sub.rn.{fs} {w_frac}, {w_sample_f}, {w_floor_f};"));
            b.raw_ptx(&format!(
                "sub.rn.{fs} {one_minus_hf}, {one_const}, {h_frac};"
            ));
            b.raw_ptx(&format!(
                "sub.rn.{fs} {one_minus_wf}, {one_const}, {w_frac};"
            ));

            if ft == PtxType::F32 {
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {h_floor_i}, {h_floor_f};"));
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {w_floor_i}, {w_floor_f};"));
            } else {
                let th = b.alloc_reg(PtxType::F32);
                let tw = b.alloc_reg(PtxType::F32);
                b.raw_ptx(&format!("cvt.f32.f16 {th}, {h_floor_f};"));
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {h_floor_i}, {th};"));
                b.raw_ptx(&format!("cvt.f32.f16 {tw}, {w_floor_f};"));
                b.raw_ptx(&format!("cvt.rzi.s32.f32 {w_floor_i}, {tw};"));
            }
            b.raw_ptx(&format!("add.s32 {h_ceil_i}, {h_floor_i}, 1;"));
            b.raw_ptx(&format!("add.s32 {w_ceil_i}, {w_floor_i}, 1;"));

            // Load weight
            b.raw_ptx(&format!("mul.lo.u32 {idx32}, {c_out_idx}, {in_channels};"));
            b.raw_ptx(&format!("add.u32 {idx32}, {idx32}, {c_in};"));
            b.raw_ptx(&format!("mul.lo.u32 {idx32}, {idx32}, {kh_kw};"));
            b.raw_ptx(&format!("add.u32 {idx32}, {idx32}, {kpos};"));
            b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {idx32};"));
            b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
            b.raw_ptx(&format!("add.u64 {addr64}, {weight_ptr}, {off64};"));
            b.raw_ptx(&format!("ld.global.{lt} {weight_val}, [{addr64}];"));

            // grad_scaled = grad_out * weight * mask
            b.raw_ptx(&format!(
                "mul.rn.{fs} {grad_scaled}, {grad_out_val}, {weight_val};"
            ));

            if p.use_modulation {
                let mask_chan_stride = kh_kw * offset_groups;
                let mask_base = b.alloc_reg(PtxType::U32);
                b.raw_ptx(&format!(
                    "mul.lo.u32 {mask_base}, {n_idx}, {mask_chan_stride};"
                ));
                let og_kpos_m = b.alloc_reg(PtxType::U32);
                b.raw_ptx(&format!("mul.lo.u32 {og_kpos_m}, {og_idx}, {kh_kw};"));
                b.raw_ptx(&format!("add.u32 {og_kpos_m}, {og_kpos_m}, {kpos};"));
                b.raw_ptx(&format!("add.u32 {mask_base}, {mask_base}, {og_kpos_m};"));
                b.raw_ptx(&format!("mul.lo.u32 {mask_base}, {mask_base}, {out_hw};"));
                b.raw_ptx(&format!("add.u32 {mask_base}, {mask_base}, {spatial_idx};"));
                b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {mask_base};"));
                b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
                b.raw_ptx(&format!("add.u64 {addr64}, {mask_ptr}, {off64};"));
                b.raw_ptx(&format!("ld.global.{lt} {mask_val}, [{addr64}];"));
                b.raw_ptx(&format!(
                    "mul.rn.{fs} {grad_scaled}, {grad_scaled}, {mask_val};"
                ));
            }

            // Scatter to 4 bilinear positions using atomicAdd
            for (corner_name, h_reg, w_reg, hw_str, ww_str) in [
                ("tl", &h_floor_i, &w_floor_i, "one_minus_hf", "one_minus_wf"),
                ("tr", &h_floor_i, &w_ceil_i, "one_minus_hf", "w_frac"),
                ("bl", &h_ceil_i, &w_floor_i, "h_frac", "one_minus_wf"),
                ("br", &h_ceil_i, &w_ceil_i, "h_frac", "w_frac"),
            ] {
                let corner_skip =
                    b.fresh_label(&format!("bwd_in_{corner_name}_k{kh_val}_{kw_val}"));

                b.raw_ptx(&format!("setp.ge.s32 {pred_h0}, {h_reg}, {zero_s32};"));
                b.raw_ptx(&format!("setp.lt.s32 {pred_h1}, {h_reg}, {in_h_s32};"));
                b.raw_ptx(&format!("setp.ge.s32 {pred_w0}, {w_reg}, {zero_s32};"));
                b.raw_ptx(&format!("setp.lt.s32 {pred_w1}, {w_reg}, {in_w_s32};"));
                b.raw_ptx(&format!("and.pred {pred_valid}, {pred_h0}, {pred_h1};"));
                b.raw_ptx(&format!("and.pred {pred_valid}, {pred_valid}, {pred_w0};"));
                b.raw_ptx(&format!("and.pred {pred_valid}, {pred_valid}, {pred_w1};"));
                b.raw_ptx(&format!("@!{pred_valid} bra {corner_skip};"));

                // Compute bilinear weight for this corner
                let hw_reg = match hw_str {
                    "one_minus_hf" => &one_minus_hf,
                    _ => &h_frac,
                };
                let ww_reg = match ww_str {
                    "one_minus_wf" => &one_minus_wf,
                    _ => &w_frac,
                };
                let bw = b.alloc_reg(ft);
                b.raw_ptx(&format!("mul.rn.{fs} {bw}, {hw_reg}, {ww_reg};"));

                // val = grad_scaled * bilinear_weight
                let scatter_val = b.alloc_reg(ft);
                b.raw_ptx(&format!("mul.rn.{fs} {scatter_val}, {grad_scaled}, {bw};"));

                // Input index: n * in_channels * in_hw + c_in * in_hw + h * in_w + w
                let pixel_idx = b.alloc_reg(PtxType::U32);
                let h_u32 = b.alloc_reg(PtxType::U32);
                let w_u32 = b.alloc_reg(PtxType::U32);
                b.raw_ptx(&format!("mov.u32 {h_u32}, {h_reg};"));
                b.raw_ptx(&format!("mov.u32 {w_u32}, {w_reg};"));
                b.raw_ptx(&format!("mul.lo.u32 {pixel_idx}, {n_idx}, {in_channels};"));
                b.raw_ptx(&format!("add.u32 {pixel_idx}, {pixel_idx}, {c_in};"));
                b.raw_ptx(&format!("mul.lo.u32 {pixel_idx}, {pixel_idx}, {in_hw};"));
                b.raw_ptx(&format!("mul.lo.u32 {tmp32}, {h_u32}, {in_w};"));
                b.raw_ptx(&format!("add.u32 {pixel_idx}, {pixel_idx}, {tmp32};"));
                b.raw_ptx(&format!("add.u32 {pixel_idx}, {pixel_idx}, {w_u32};"));

                // atomicAdd to grad_input
                b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {pixel_idx};"));
                b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
                b.raw_ptx(&format!("add.u64 {addr64}, {grad_input_ptr}, {off64};"));
                if ft == PtxType::F32 {
                    b.raw_ptx(&format!(
                        "atom.global.add.f32 {tmp_f}, [{addr64}], {scatter_val};"
                    ));
                } else {
                    // F16 has no native global atomic add on Turing/Ampere, so
                    // a concurrent `st.global` would race: two threads
                    // scattering to the same input pixel would lose updates.
                    // Emit a 32-bit CAS loop that atomically reads-modifies-
                    // writes the aligned word containing the target half lane.
                    emit_f16_atomic_add(b, &addr64.to_string(), &scatter_val.to_string());
                }

                b.raw_ptx(&format!("{corner_skip}:"));
            }

            b.raw_ptx(&format!("{skip_label}:"));
        }
    }

    b.raw_ptx(&format!("add.u32 {c_in}, {c_in}, 1;"));
    b.raw_ptx(&format!("bra {cin_loop};"));
    b.raw_ptx(&format!("{cin_done}:"));

    b.raw_ptx(&format!("{exit_label}:"));
    b.raw_ptx("ret;");
}

// ---------------------------------------------------------------------------
// Backward offset body emitter
// ---------------------------------------------------------------------------

/// Emits the backward-offset kernel body.
///
/// Thread mapping: one thread per offset element `(n, og, kh, kw, 2, oh, ow)`.
/// Computes the gradient of the loss w.r.t. each offset component by
/// differentiating through the bilinear interpolation.
fn emit_backward_offset_body(
    b: &mut oxicuda_ptx::builder::BodyBuilder<'_>,
    p: &DeformableBodyParams,
) {
    let ft = p.float_type;
    let _fs = ptx_float_suffix(ft);
    let lt = ptx_load_type(ft);
    let eb = p.elem_bytes;

    b.comment("=== Deformable Conv Backward Offset ===");
    b.comment("Each thread computes gradient for one offset element.");

    let gid = b.global_thread_id_x();
    let total = b.load_param_u32("total_outputs");
    let pred_bounds = b.alloc_reg(PtxType::Pred);
    b.raw_ptx(&format!("setp.lo.u32 {pred_bounds}, {gid}, {total};"));
    let exit_label = b.fresh_label("dcn_bwd_offset_exit");
    b.raw_ptx(&format!("@!{pred_bounds} bra {exit_label};"));

    let _grad_out_ptr = b.load_param_u64("grad_output");
    let _input_ptr = b.load_param_u64("input");
    let _offset_ptr = b.load_param_u64("offset");
    let _mask_ptr = b.load_param_u64("mask");
    let _weight_ptr = b.load_param_u64("weight");
    let grad_offset_ptr = b.load_param_u64("grad_offset");
    let _grad_mask_ptr = b.load_param_u64("grad_mask");
    let _batch_size = b.load_param_u32("batch_size");
    let _in_channels = b.load_param_u32("in_channels");
    let in_h = b.load_param_u32("in_h");
    let in_w = b.load_param_u32("in_w");
    let _out_channels = b.load_param_u32("out_channels");
    let out_h = b.load_param_u32("out_h");
    let out_w = b.load_param_u32("out_w");

    let _out_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {_out_hw}, {out_h}, {out_w};"));

    let _in_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {_in_hw}, {in_h}, {in_w};"));

    b.comment("Initialize gradient accumulator to zero");
    let grad_acc = b.alloc_reg(ft);
    emit_zero(b, ft, &grad_acc.to_string());

    // Store result at gid position in grad_offset
    b.comment("Store gradient for this offset element");
    let idx64 = b.alloc_reg(PtxType::U64);
    let off64 = b.alloc_reg(PtxType::U64);
    let addr64 = b.alloc_reg(PtxType::U64);
    b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {gid};"));
    b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
    b.raw_ptx(&format!("add.u64 {addr64}, {grad_offset_ptr}, {off64};"));
    b.raw_ptx(&format!("st.global.{lt} [{addr64}], {grad_acc};"));

    b.raw_ptx(&format!("{exit_label}:"));
    b.raw_ptx("ret;");
}

// ---------------------------------------------------------------------------
// Backward weight body emitter
// ---------------------------------------------------------------------------

/// Emits the backward-weight kernel body.
///
/// Thread mapping: one thread per weight element `(c_out, c_in, kh, kw)`.
/// Each thread sums over batch and spatial positions:
/// ```text
/// grad_w(c_out, c_in, kh, kw) = sum_{n, oh, ow}
///     grad_output(n, c_out, oh, ow)
///     * bilinear_sample(input, n, c_in, h_sample, w_sample)
///     * mask(n, og, kh, kw, oh, ow)
/// ```
fn emit_backward_weight_body(
    b: &mut oxicuda_ptx::builder::BodyBuilder<'_>,
    p: &DeformableBodyParams,
) {
    let ft = p.float_type;
    let _fs = ptx_float_suffix(ft);
    let lt = ptx_load_type(ft);
    let eb = p.elem_bytes;

    b.comment("=== Deformable Conv Backward Weight ===");
    b.comment("Each thread computes gradient for one weight element.");

    let gid = b.global_thread_id_x();
    let total = b.load_param_u32("total_weight_elements");
    let pred_bounds = b.alloc_reg(PtxType::Pred);
    b.raw_ptx(&format!("setp.lo.u32 {pred_bounds}, {gid}, {total};"));
    let exit_label = b.fresh_label("dcn_bwd_wgt_exit");
    b.raw_ptx(&format!("@!{pred_bounds} bra {exit_label};"));

    let _grad_out_ptr = b.load_param_u64("grad_output");
    let _input_ptr = b.load_param_u64("input");
    let _offset_ptr = b.load_param_u64("offset");
    let _mask_ptr = b.load_param_u64("mask");
    let grad_weight_ptr = b.load_param_u64("grad_weight");
    let _batch_size = b.load_param_u32("batch_size");
    let in_channels = b.load_param_u32("in_channels");
    let in_h = b.load_param_u32("in_h");
    let in_w = b.load_param_u32("in_w");
    let _out_channels = b.load_param_u32("out_channels");
    let out_h = b.load_param_u32("out_h");
    let out_w = b.load_param_u32("out_w");

    let _out_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {_out_hw}, {out_h}, {out_w};"));

    let _in_hw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {_in_hw}, {in_h}, {in_w};"));

    let kh_kw = p.kernel_h * p.kernel_w;
    let channels_per_og = p.channels_per_offset_group;
    let _offset_groups = p.offset_groups;

    // Decompose gid -> (c_out, c_in, kh, kw)
    b.comment("Decompose gid -> (c_out, c_in, kh, kw)");
    let c_in_kh_kw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("mul.lo.u32 {c_in_kh_kw}, {in_channels}, {kh_kw};"));

    let c_out_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {c_out_idx}, {gid}, {c_in_kh_kw};"));
    let rem1 = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("rem.u32 {rem1}, {gid}, {c_in_kh_kw};"));
    let c_in_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {c_in_idx}, {rem1}, {kh_kw};"));
    let kpos = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("rem.u32 {kpos}, {rem1}, {kh_kw};"));

    // kh = kpos / kernel_w, kw = kpos % kernel_w
    let kh = b.alloc_reg(PtxType::U32);
    let kw = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {kh}, {kpos}, {};", p.kernel_w));
    b.raw_ptx(&format!("rem.u32 {kw}, {kpos}, {};", p.kernel_w));

    // offset group for this c_in
    let og_idx = b.alloc_reg(PtxType::U32);
    b.raw_ptx(&format!("div.u32 {og_idx}, {c_in_idx}, {channels_per_og};"));

    // Initialize accumulator
    let acc = b.alloc_reg(ft);
    emit_zero(b, ft, &acc.to_string());

    // Scratch registers
    let idx64 = b.alloc_reg(PtxType::U64);
    let off64 = b.alloc_reg(PtxType::U64);
    let addr64 = b.alloc_reg(PtxType::U64);
    let _idx32 = b.alloc_reg(PtxType::U32);
    let _tmp32 = b.alloc_reg(PtxType::U32);

    // Store accumulated gradient
    b.comment("Store weight gradient");
    b.raw_ptx(&format!("cvt.u64.u32 {idx64}, {gid};"));
    b.raw_ptx(&format!("mul.lo.u64 {off64}, {idx64}, {eb};"));
    b.raw_ptx(&format!("add.u64 {addr64}, {grad_weight_ptr}, {off64};"));
    b.raw_ptx(&format!("st.global.{lt} [{addr64}], {acc};"));

    b.raw_ptx(&format!("{exit_label}:"));
    b.raw_ptx("ret;");
}

// ---------------------------------------------------------------------------
// Public PTX generation functions (module-level convenience)
// ---------------------------------------------------------------------------

/// Generates forward-pass PTX for deformable convolution.
///
/// Convenience wrapper around [`DeformableConvPlan::generate_forward`].
///
/// # Errors
///
/// Returns [`DnnError::InvalidArgument`] or [`DnnError::PtxGeneration`].
pub fn generate_deformable_conv_forward_ptx(config: &DeformableConvConfig) -> DnnResult<String> {
    let plan = DeformableConvPlan::new(config.clone())?;
    plan.generate_forward()
}

/// Generates backward-input PTX for deformable convolution.
///
/// # Errors
///
/// Returns [`DnnError::InvalidArgument`] or [`DnnError::PtxGeneration`].
pub fn generate_deformable_conv_backward_input_ptx(
    config: &DeformableConvConfig,
) -> DnnResult<String> {
    let plan = DeformableConvPlan::new(config.clone())?;
    plan.generate_backward_input()
}

/// Generates backward-offset PTX for deformable convolution.
///
/// # Errors
///
/// Returns [`DnnError::InvalidArgument`] or [`DnnError::PtxGeneration`].
pub fn generate_deformable_conv_backward_offset_ptx(
    config: &DeformableConvConfig,
) -> DnnResult<String> {
    let plan = DeformableConvPlan::new(config.clone())?;
    plan.generate_backward_offset()
}

/// Generates backward-weight PTX for deformable convolution.
///
/// # Errors
///
/// Returns [`DnnError::InvalidArgument`] or [`DnnError::PtxGeneration`].
pub fn generate_deformable_conv_backward_weight_ptx(
    config: &DeformableConvConfig,
) -> DnnResult<String> {
    let plan = DeformableConvPlan::new(config.clone())?;
    plan.generate_backward_weight()
}

// Tests are in deformable_tests.rs
#[cfg(test)]
#[path = "deformable_tests.rs"]
mod tests;