realizar 0.8.5

Pure Rust ML inference engine built from scratch — serves models in GGUF and safetensors formats
impl CudaExecutor {
    /// Phase 3-5: Attention + output projection + residual1
    ///
    /// Runs three steps of a decoder layer entirely into pre-allocated
    /// workspace buffers (PAR-051: no per-token GPU allocations):
    ///
    ///   3. Incremental attention: (q_buf, k_buf, v_buf) -> attn_out_buf,
    ///      executed on `compute_stream`.
    ///   4. Output projection GEMV: attn_out_buf -> hidden_buf1, executed on
    ///      `self.stream`, ordered after step 3 via a CUDA event (GH-559-PERF).
    ///   5. First residual add: input + hidden_buf1 -> input_staging.
    ///
    /// # Parameters
    /// - `input`: layer input hidden state; read (not written) by the residual add.
    /// - `hidden_buf1`: scratch buffer; overwritten with the output-projection result.
    /// - `q_buf`/`k_buf`/`v_buf`: Q/K/V activations consumed by incremental attention.
    /// - `attn_out_buf`: scratch buffer; receives the attention output.
    /// - `input_staging`: receives `input + hidden_buf1` (see PAR-044 note below on
    ///   why this is not `hidden_buf2`).
    /// - `layer_idx`: decoder layer index; debug buffer checks only run for layers 0-3.
    /// - `layer_weights`: validated weight pointer + quantization type for the
    ///   attention output projection.
    /// - `hidden_dim`/`q_dim`: output and input widths of the projection GEMV.
    /// - `skip_debug`: true during CUDA graph capture, where the debug
    ///   synchronize/readback below must be skipped.
    /// - `profiling`: when true, brackets each step with brick timers.
    ///
    /// # Errors
    /// Propagates any `GpuError` from the attention kernel, stream
    /// synchronization, event record/wait, GEMV dispatch, or residual add.
    ///
    /// # Panics
    /// On first use, if `CudaEvent::new()` fails while lazily creating the
    /// attention ordering event (see NOTE(review) at that site).
    #[allow(clippy::too_many_arguments, clippy::too_many_lines)]
    fn workspace_attention_residual_phase(
        &mut self,
        input: &GpuBuffer<f32>,
        hidden_buf1: &GpuBuffer<f32>,
        q_buf: &GpuBuffer<f32>,
        k_buf: &GpuBuffer<f32>,
        v_buf: &GpuBuffer<f32>,
        attn_out_buf: &GpuBuffer<f32>,
        input_staging: &GpuBuffer<f32>,
        layer_idx: usize,
        layer_weights: &ValidatedLayerWeights,
        hidden_dim: u32,
        q_dim: u32,
        skip_debug: bool,
        profiling: bool,
    ) -> Result<(), GpuError> {
        // 3. PAR-051: Incremental attention into pre-allocated workspace buffer
        // Eliminates 28 GPU allocations per token
        // PAR-054-FIX: Use capture-safe version during graph capture to skip debug sync
        let timer_attn = if profiling {
            self.start_brick_id(trueno::BrickId::AttentionScore)
        } else {
            None
        };
        // GH-559 DIAGNOSTIC: Force non-capture attention path to test if
        // incremental_attention_into_for_capture has a bug on sm_121.
        // The _for_capture variant may produce wrong results on Blackwell.
        // NOTE(review): this diagnostic forces the non-capture path even when
        // `skip_debug` (graph capture) is set — confirm this is still intended
        // before relying on graph capture with this method.
        let _seq_len = self.incremental_attention_into(layer_idx, q_buf, k_buf, v_buf, attn_out_buf)?;
        if profiling {
            self.stop_brick_id(timer_attn, 1);
        }

        // PAR-058-DEBUG: Check attention output (skip during graph capture)
        // Note: attention runs on compute_stream, so sync that first
        // Debug checks are limited to the first 4 layers to bound overhead.
        if !skip_debug && layer_idx < 4 {
            self.compute_stream.synchronize()?;
            self.debug_check_buf(attn_out_buf, "Attn", layer_idx)?;
        }

        // GH-559-PERF: Event-based cross-stream dependency (was: compute_stream.synchronize()).
        // Attention runs on compute_stream, output projection reads attn_out_buf on self.stream.
        // Record event on compute_stream, then make self.stream wait for it.
        // This preserves correctness (output projection waits for attention) without blocking
        // the CPU or stalling other GPU work — saves ~0.5-1.0ms/token (28 layers × per-token).
        //
        // Five-Whys (F-PARITY-02): realizar 1.59x slower → serving overhead → full GPU sync
        // 28x per token in attention phase → cuStreamSynchronize blocks CPU+GPU →
        // replaced with cuStreamWaitEvent (non-blocking GPU-side ordering).
        {
            // Event is created lazily once and reused for every layer/token.
            // NOTE(review): the closure cannot use `?`, so creation failure
            // panics. If CudaEvent's error type converts into GpuError,
            // consider restructuring to propagate instead — TODO confirm.
            let event = self.attention_event.get_or_insert_with(|| {
                CudaEvent::new().expect("GH-559-PERF: attention event creation")
            });
            self.compute_stream.record_event(event)?;
            self.stream.wait_event(event)?;
        }

        // PMAT-027: Invalidate Q8 cache — input is now attn_out_buf (different from QKV's hidden_buf1).
        // Must happen before gemv_dispatch so it does not reuse a stale quantized activation.
        self.q8_activation_valid = false;

        // 4. Output projection: attn_out_buf -> hidden_buf1 (reuse, normed no longer needed)
        let timer_oproj = if profiling {
            self.start_brick_id(trueno::BrickId::OutputProjection)
        } else {
            None
        };
        self.gemv_dispatch(
            layer_weights.attn_output_qtype,
            layer_weights.attn_output_ptr,
            attn_out_buf, hidden_buf1, hidden_dim, q_dim,
        )?;
        if profiling {
            self.stop_brick_id(timer_oproj, 1);
        }

        // PAR-058-DEBUG: Check output projection (skip during graph capture)
        if !skip_debug && layer_idx < 4 {
            self.debug_check_buf(hidden_buf1, "Output proj", layer_idx)?;
        }

        // 5. First residual: input + projected -> input_staging (PAR-044 FIX)
        // NOTE: Using input_staging instead of hidden_buf2 to avoid read/write conflict
        // when input IS hidden_buf2 (layers 1+)
        // PAR-075: Cannot fuse with RmsNorm2 because we need input_staging for second residual
        let timer_res1 = if profiling {
            self.start_brick_timer("Residual1")
        } else {
            None
        };
        self.residual_add_into(input, hidden_buf1, input_staging, hidden_dim)?;
        if profiling {
            self.stop_brick_timer(timer_res1, 1);
        }

        // PAR-058-DEBUG: Check residual1 output (skip during graph capture)
        if !skip_debug && layer_idx < 4 {
            self.debug_check_buf(input_staging, "Residual1", layer_idx)?;
        }

        Ok(())
    }
}