// rustyasg 0.4.1 - Docs.rs

//! # Automatic Differentiation Module
//!
//! This module implements **graph-to-graph automatic differentiation** (autograd)
//! using reverse-mode differentiation (backpropagation).
//!
//! ## How It Works
//!
//! Instead of computing gradients during the forward pass, this module builds
//! a **separate gradient graph** that computes ∂loss/∂params when executed.
//!
//! Key features:
//! - Correct handling of **broadcasting** in element-wise operations
//! - Full support for **Parameter nodes** (trainable weights)
//! - **Graph-level** differentiation - gradients are computed symbolically
//!
//! ## Example
//!
//! ```ignore
//! use rustyasg::autograd::Gradients;
//!
//! // Build forward graph (already done via Tensor API)
//! let forward_graph = context.borrow().main_graph().clone();
//!
//! // Build gradient graph
//! let grad_graph = Gradients::new(forward_graph)
//!     .build(loss_node_id, &[weight_node_id, bias_node_id])
//!     .unwrap();
//!
//! // Execute gradient graph on backend to get actual gradient values
//! ```
//!
//! ## Supported Operations
//!
//! The autograd system supports gradients for:
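//! - Arithmetic: Add, Subtract, Multiply, Divide, Power (w.r.t. the base only),
//!   Neg, Abs, Sqrt, Exp, Log, Clamp
//! - Matrix and shape operations: MatrixMultiply, Transpose, Reshape, Slice, Concat
//! - Activations: ReLU, Sigmoid, Tanh, Softmax, GELU, SiLU, LeakyReLU, ELU, Softplus
//! - Reductions: Sum, Mean
//! - Convolutions and pooling: Conv2d, MaxPool2d, AvgPool2d
//! - Normalization: LayerNorm, BatchNorm (via dedicated backward operations)
//! - Lookup: Embedding (w.r.t. the weight only)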

use crate::analysis::shape_inference::{ShapeInference, ShapeInferenceError};
use crate::asg::{Asg, AsgError, NodeId, NodeType, Value};
use std::collections::{HashMap, HashSet};
use thiserror::Error;

/// Errors that can be returned while building a gradient graph.
#[derive(Error, Debug, Clone, PartialEq)]
pub enum AutogradError {
    /// A graph operation failed; wraps the underlying [`AsgError`].
    #[error("Graph error while building gradients: {0}")]
    Asg(#[from] AsgError),

    /// Shape inference failed or required shape information was missing;
    /// wraps the underlying [`ShapeInferenceError`].
    #[error("Shape inference error in gradient graph: {0}")]
    Shape(#[from] ShapeInferenceError),

    /// The named operation has no backward rule. The payload is the
    /// operation name shown in the message.
    #[error(
        "Operation '{0}' does not support automatic differentiation. \
             Consider using an alternative operation or implement backward for it."
    )]
    UnsupportedOperation(String),

    /// No gradient was recorded for the given node.
    #[error(
        "Gradient for node {0} not found. \
             Ensure the node is connected to the loss through differentiable operations."
    )]
    GradientNotFound(NodeId),
}

/// Builder that derives a gradient graph from a source (forward) graph.
///
/// Created with `Gradients::new` and consumed by `Gradients::build`, which
/// differentiates a single target node.
pub struct Gradients {
    /// Source graph being differentiated.
    src: Asg,
    /// Gradient graph under construction.
    grad: Asg,
    /// For each node id in `src`, the id of the node in `grad` that currently
    /// holds its accumulated gradient.
    map: HashMap<NodeId, NodeId>, // src_node -> grad_node
}

impl Gradients {
    /// Start building gradients for the source graph `src`.
    ///
    /// Takes ownership of `src`. The gradient graph starts out empty, is named
    /// `"grad_graph"`, and gets the id `src.id.wrapping_add(1)`.
    pub fn new(src: Asg) -> Self {
        // Read the id before `src` is moved into the struct below.
        let grad_id = src.id.wrapping_add(1);
        Self {
            // `src` is already owned, so move it instead of deep-copying the graph.
            src,
            grad: Asg::new(grad_id, Some("grad_graph".into())),
            map: HashMap::new(),
        }
    }

    /// Build the gradient graph `∂loss/∂wrt`.
    /// Returns a new ASG whose outputs are gradients for each node in `wrt`.
    pub fn build(mut self, loss: NodeId, wrt: &[NodeId]) -> Result<Asg, AutogradError> {
        // 1. Topological sort from loss
        let order = self.topo(loss)?;
        // 2. Create "1" for ∂loss/∂loss
        let one = self.lit_scalar(1.0);
        self.map.insert(loss, one);

        // 3. Backward pass to build the main gradient graph
        for &node in order.iter().rev() {
            if !self.map.contains_key(&node) {
                continue;
            }
            let g_out = self.map[&node];
            self.backward_node(node, g_out)?;
        }

        // 4. Run ShapeInference ONCE when the graph is nearly ready
        let mut initial_shapes = HashMap::new();
        for n in self.src.nodes.values() {
            if let (Some(s), Some(dt)) = (&n.shape, &n.dtype) {
                let ext_name = self.ext_name(n.id);
                initial_shapes.insert(ext_name, (s.clone(), *dt));
            }
        }
        self.grad.set_outputs(self.map.values().copied().collect());
        ShapeInference::run(&mut self.grad, &initial_shapes)?;

        // 5. Adjust gradients for broadcast operations by adding ReduceSumTo
        for &wrt_id in wrt {
            if let Some(&grad_id) = self.map.get(&wrt_id) {
                let grad_node = self.grad.get_node(grad_id)?.clone();
                let grad_shape = grad_node.shape.as_ref().unwrap();
                let param_shape = self.src.get_node(wrt_id)?.shape.as_ref().unwrap();

                if grad_shape != param_shape {
                    let param_as_external = self.import(wrt_id)?;
                    let final_grad = self
                        .grad
                        .add_node(None, NodeType::ReduceSumTo(grad_id, param_as_external));
                    self.map.insert(wrt_id, final_grad);
                }
            }
        }

        // 6. Collect final outputs
        let final_outputs: Vec<_> = wrt
            .iter()
            .map(|&n| self.get_or_zero(n))
            .collect::<Result<_, _>>()?;
        self.grad.set_outputs(final_outputs);

        // 7. Run ShapeInference one last time to process new ReduceSumTo nodes
        ShapeInference::run(&mut self.grad, &initial_shapes)?;

        Ok(self.grad)
    }

    // ---------- internal helper methods ----------

    /// Name used in the gradient graph for source node `src_id`.
    ///
    /// The result has the form `external_<source graph id>_<source node id>`.
    fn ext_name(&self, src_id: NodeId) -> String {
        let graph_id = &self.src.id;
        format!("external_{}_{}", graph_id, src_id)
    }

    /// Add an unnamed rank-0 literal holding `v` to the gradient graph and
    /// return the id of the new node.
    fn lit_scalar(&mut self, v: f32) -> NodeId {
        let scalar = ndarray::arr0(v).into_dyn();
        let literal = NodeType::Literal(Value::Tensor(scalar));
        self.grad.add_node(None, literal)
    }

    /// Make source node `src_id` available inside the gradient graph.
    ///
    /// If the gradient graph already contains a node carrying the external
    /// name of `src_id`, that node's id is returned. Otherwise a new
    /// `External` node pointing back at the source graph is added, and the
    /// source node's shape and dtype are copied onto it.
    ///
    /// # Errors
    ///
    /// Returns a graph error if `src_id` is not found in the source graph or
    /// the newly added node cannot be fetched for mutation.
    fn import(&mut self, src_id: NodeId) -> Result<NodeId, AutogradError> {
        let name = self.ext_name(src_id);

        // Reuse an earlier import of the same source node, if there is one.
        let existing = self
            .grad
            .nodes
            .values()
            .find(|candidate| candidate.name.as_deref() == Some(name.as_str()))
            .map(|candidate| candidate.id);
        if let Some(id) = existing {
            return Ok(id);
        }

        // Fetch shape/dtype first so a missing source node fails before the
        // gradient graph is touched.
        let (shape, dtype) = {
            let origin = self.src.get_node(src_id)?;
            (origin.shape.clone(), origin.dtype)
        };

        let external = NodeType::External {
            name: name.clone(),
            source_asg_id: self.src.id,
            source_node_id: src_id,
        };
        let new_id = self.grad.add_node(Some(name), external);

        let imported = self.grad.get_node_mut(new_id)?;
        imported.shape = shape;
        imported.dtype = dtype;
        Ok(new_id)
    }

    /// Return the gradient node recorded for `src_id`, or create a zero one.
    ///
    /// When no gradient has been recorded, a literal of zeros with the source
    /// node's shape is added to the gradient graph under the name
    /// `zero_grad_<src_id>`, and the source node's shape and dtype are copied
    /// onto it.
    ///
    /// # Errors
    ///
    /// Returns a graph error if `src_id` is unknown, or
    /// `ShapeInferenceError::MissingInitialShape` (carrying the node's name,
    /// or an empty string if it has none) when the source node has no shape.
    fn get_or_zero(&mut self, src_id: NodeId) -> Result<NodeId, AutogradError> {
        if let Some(existing) = self.map.get(&src_id).copied() {
            return Ok(existing);
        }

        let src_node = self.src.get_node(src_id)?;
        let dims = match src_node.shape.as_ref() {
            Some(dims) => dims.clone(),
            None => {
                let label = src_node.name.clone().unwrap_or_default();
                return Err(ShapeInferenceError::MissingInitialShape(label).into());
            }
        };
        let dtype = src_node.dtype;

        let zero_tensor = ndarray::ArrayD::zeros(dims.clone());
        let zero_id = self.grad.add_node(
            Some(format!("zero_grad_{}", src_id)),
            NodeType::Literal(Value::Tensor(zero_tensor)),
        );

        let zero_node = self.grad.get_node_mut(zero_id)?;
        zero_node.shape = Some(dims);
        zero_node.dtype = dtype;
        Ok(zero_id)
    }

    /// Record `delta` as a gradient contribution for source node `src_id`.
    ///
    /// The first contribution is stored as-is. Each later one is combined with
    /// the stored gradient through a new `Add` node in the gradient graph.
    /// No shape or broadcast handling happens here.
    fn acc(&mut self, src_id: NodeId, delta: NodeId) -> Result<(), AutogradError> {
        let total = match self.map.get(&src_id).copied() {
            Some(previous) => self.grad.add_node(None, NodeType::Add(previous, delta)),
            None => delta,
        };
        self.map.insert(src_id, total);
        Ok(())
    }

    /// Main backward method for a single node.
    fn backward_node(&mut self, node: NodeId, g_out: NodeId) -> Result<(), AutogradError> {
        let n = self.src.get_node(node)?.clone();
        match &n.node_type {
            NodeType::Input { .. }
            | NodeType::Parameter { .. }
            | NodeType::Literal(_)
            | NodeType::External { .. } => {}

            NodeType::Add(a, b) => {
                self.acc(*a, g_out)?;
                self.acc(*b, g_out)?;
            }
            NodeType::Subtract(a, b) => {
                let minus_one = self.lit_scalar(-1.0);
                let g_b = self
                    .grad
                    .add_node(None, NodeType::Multiply(g_out, minus_one));
                self.acc(*a, g_out)?;
                self.acc(*b, g_b)?;
            }
            NodeType::Multiply(a, b) => {
                let a_node = self.import(*a)?;
                let b_node = self.import(*b)?;
                let g_a = self.grad.add_node(None, NodeType::Multiply(g_out, b_node));
                let g_b = self.grad.add_node(None, NodeType::Multiply(g_out, a_node));
                self.acc(*a, g_a)?;
                self.acc(*b, g_b)?;
            }
            NodeType::Divide(a, b) => {
                let a_node = self.import(*a)?;
                let b_node = self.import(*b)?;
                let g_a = self.grad.add_node(None, NodeType::Divide(g_out, b_node));
                let num = self.grad.add_node(None, NodeType::Multiply(g_out, a_node));
                let b2 = self.grad.add_node(None, NodeType::Multiply(b_node, b_node));
                let gb_num = self.grad.add_node(None, NodeType::Divide(num, b2));
                let minus_one = self.lit_scalar(-1.0);
                let g_b = self
                    .grad
                    .add_node(None, NodeType::Multiply(gb_num, minus_one));
                self.acc(*a, g_a)?;
                self.acc(*b, g_b)?;
            }
            NodeType::MatrixMultiply(a, b) => {
                let a_node = self.import(*a)?;
                let b_node = self.import(*b)?;
                let a_shape = self.src.get_node(*a)?.shape.as_ref().unwrap();
                let b_shape = self.src.get_node(*b)?.shape.as_ref().unwrap();
                let a_rank = a_shape.len();
                let b_rank = b_shape.len();
                let b_t = self
                    .grad
                    .add_node(None, NodeType::Transpose(b_node, b_rank - 2, b_rank - 1));
                let g_a = self
                    .grad
                    .add_node(None, NodeType::MatrixMultiply(g_out, b_t));
                let a_t = self
                    .grad
                    .add_node(None, NodeType::Transpose(a_node, a_rank - 2, a_rank - 1));
                let g_b = self
                    .grad
                    .add_node(None, NodeType::MatrixMultiply(a_t, g_out));
                self.acc(*a, g_a)?;
                self.acc(*b, g_b)?;
            }
            NodeType::Mean(x) => {
                // Mean along last axis: mean(x) = sum(x, axis=-1) / n
                // Gradient: d_x = g_out / n (broadcast along last axis)
                let shape = self.src.get_node(*x)?.shape.as_ref().unwrap();
                let n = *shape.last().unwrap_or(&1) as f32;
                let scale = self.lit_scalar(1.0 / n);

                // g_out has shape with last axis = 1, need to broadcast to original shape of x
                let x_node = self.import(*x)?;
                let g_bcast = self.grad.add_node(None, NodeType::Broadcast(g_out, x_node));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_bcast, scale));
                self.acc(*x, g_x)?;
            }
            // REMOVED: special handling for Variance is no longer needed
            NodeType::Variance(_) => {
                return Err(AutogradError::Asg(AsgError::InputNotFound(
                    "Variance autograd is handled by decomposition".to_string(),
                )));
            }
            NodeType::Sqrt(x) => {
                let sqrt_x = self.import(node)?;
                let half = self.lit_scalar(0.5);
                let num = self.grad.add_node(None, NodeType::Multiply(half, g_out));
                let g_x = self.grad.add_node(None, NodeType::Divide(num, sqrt_x));
                self.acc(*x, g_x)?;
            }
            NodeType::ReLU(x) => {
                let x_node = self.import(*x)?;
                let zero = self.lit_scalar(0.0);
                let mask = self
                    .grad
                    .add_node(None, NodeType::GreaterThan(x_node, zero));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, mask));
                self.acc(*x, g_x)?;
            }
            NodeType::Sum(x) => {
                let x_node = self.import(*x)?;
                let g_x = self.grad.add_node(None, NodeType::Broadcast(g_out, x_node));
                self.acc(*x, g_x)?;
            }
            NodeType::Softmax(x) => {
                let s = self.import(node)?;
                let prod = self.grad.add_node(None, NodeType::Multiply(g_out, s));
                let sum_prod = self.grad.add_node(None, NodeType::Sum(prod));
                let bcast_sum = self
                    .grad
                    .add_node(None, NodeType::Broadcast(sum_prod, g_out));
                let sub = self
                    .grad
                    .add_node(None, NodeType::Subtract(g_out, bcast_sum));
                let g_x = self.grad.add_node(None, NodeType::Multiply(sub, s));
                self.acc(*x, g_x)?;
            }
            NodeType::Transpose(x, ax1, ax2) => {
                let g_x = self
                    .grad
                    .add_node(None, NodeType::Transpose(g_out, *ax1, *ax2));
                self.acc(*x, g_x)?;
            }
            NodeType::Reshape(data, _) => {
                let data_node_src = self.src.get_node(*data)?;
                let original_shape = data_node_src.shape.as_ref().unwrap();
                let shape_data_f32: Vec<f32> = original_shape.iter().map(|&d| d as f32).collect();
                let shape_array = ndarray::ArrayD::from_shape_vec(
                    ndarray::IxDyn(&[original_shape.len()]),
                    shape_data_f32,
                )
                .unwrap();
                let shape_node_grad = self
                    .grad
                    .add_node(None, NodeType::Literal(Value::Tensor(shape_array)));
                let g_x = self
                    .grad
                    .add_node(None, NodeType::Reshape(g_out, shape_node_grad));
                self.acc(*data, g_x)?;
            }
            NodeType::MaxPool2d {
                input,
                kernel_size,
                stride,
            } => {
                // Gradient flows back through max positions
                let input_node = self.import(*input)?;
                let g_input = self.grad.add_node(
                    None,
                    NodeType::MaxUnpool2d {
                        input: g_out,
                        original_input: input_node,
                        kernel_size: *kernel_size,
                        stride: *stride,
                    },
                );
                self.acc(*input, g_input)?;
            }
            NodeType::Sigmoid(x) => {
                // d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
                let sig_x = self.import(node)?;
                let one = self.lit_scalar(1.0);
                let one_minus_sig = self.grad.add_node(None, NodeType::Subtract(one, sig_x));
                let sig_deriv = self
                    .grad
                    .add_node(None, NodeType::Multiply(sig_x, one_minus_sig));
                let g_x = self
                    .grad
                    .add_node(None, NodeType::Multiply(g_out, sig_deriv));
                self.acc(*x, g_x)?;
            }
            NodeType::Tanh(x) => {
                // d/dx tanh(x) = 1 - tanh(x)^2
                let tanh_x = self.import(node)?;
                let one = self.lit_scalar(1.0);
                let tanh_sq = self.grad.add_node(None, NodeType::Multiply(tanh_x, tanh_x));
                let deriv = self.grad.add_node(None, NodeType::Subtract(one, tanh_sq));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, deriv));
                self.acc(*x, g_x)?;
            }
            NodeType::Exp(x) => {
                // d/dx exp(x) = exp(x)
                let exp_x = self.import(node)?;
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, exp_x));
                self.acc(*x, g_x)?;
            }
            NodeType::Log(x) => {
                // d/dx log(x) = 1/x
                let x_node = self.import(*x)?;
                let g_x = self.grad.add_node(None, NodeType::Divide(g_out, x_node));
                self.acc(*x, g_x)?;
            }
            NodeType::Neg(x) => {
                // d/dx (-x) = -1
                let minus_one = self.lit_scalar(-1.0);
                let g_x = self
                    .grad
                    .add_node(None, NodeType::Multiply(g_out, minus_one));
                self.acc(*x, g_x)?;
            }
            NodeType::LeakyReLU(x, slope) => {
                // d/dx leaky_relu(x) = 1 if x > 0 else slope
                let x_node = self.import(*x)?;
                let zero = self.lit_scalar(0.0);
                let one = self.lit_scalar(1.0);
                let slope_lit = self.lit_scalar(*slope);
                let mask = self
                    .grad
                    .add_node(None, NodeType::GreaterThan(x_node, zero));
                // deriv = mask * 1.0 + (1 - mask) * slope
                let one_minus_mask_lit = self.lit_scalar(1.0);
                let one_minus_mask = self
                    .grad
                    .add_node(None, NodeType::Subtract(one_minus_mask_lit, mask));
                let part1 = self.grad.add_node(None, NodeType::Multiply(mask, one));
                let part2 = self
                    .grad
                    .add_node(None, NodeType::Multiply(one_minus_mask, slope_lit));
                let deriv = self.grad.add_node(None, NodeType::Add(part1, part2));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, deriv));
                self.acc(*x, g_x)?;
            }
            NodeType::Abs(x) => {
                // d/dx |x| = sign(x) = 2*mask(x>0) - 1
                let x_node = self.import(*x)?;
                let zero = self.lit_scalar(0.0);
                let one = self.lit_scalar(1.0);
                let mask = self
                    .grad
                    .add_node(None, NodeType::GreaterThan(x_node, zero));
                // sign = mask * 1 + (1 - mask) * (-1) = 2*mask - 1
                let two = self.lit_scalar(2.0);
                let two_mask = self.grad.add_node(None, NodeType::Multiply(two, mask));
                let sign = self.grad.add_node(None, NodeType::Subtract(two_mask, one));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, sign));
                self.acc(*x, g_x)?;
            }
            NodeType::ELU(x, alpha) => {
                // d/dx ELU(x) = 1 if x > 0 else alpha * exp(x)
                let x_node = self.import(*x)?;
                let zero = self.lit_scalar(0.0);
                let one = self.lit_scalar(1.0);
                let alpha_lit = self.lit_scalar(*alpha);
                let mask = self
                    .grad
                    .add_node(None, NodeType::GreaterThan(x_node, zero));
                let exp_x = self.grad.add_node(None, NodeType::Exp(x_node));
                let alpha_exp = self
                    .grad
                    .add_node(None, NodeType::Multiply(alpha_lit, exp_x));
                // deriv = mask * 1 + (1 - mask) * alpha * exp(x)
                let one_minus_mask_lit = self.lit_scalar(1.0);
                let one_minus_mask = self
                    .grad
                    .add_node(None, NodeType::Subtract(one_minus_mask_lit, mask));
                let part1 = self.grad.add_node(None, NodeType::Multiply(mask, one));
                let part2 = self
                    .grad
                    .add_node(None, NodeType::Multiply(one_minus_mask, alpha_exp));
                let deriv = self.grad.add_node(None, NodeType::Add(part1, part2));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, deriv));
                self.acc(*x, g_x)?;
            }
            NodeType::Softplus(x, beta) => {
                // Softplus(x) = log(1 + exp(beta*x)) / beta
                // d/dx Softplus(x) = sigmoid(beta*x)
                let x_node = self.import(*x)?;
                let beta_lit = self.lit_scalar(*beta);
                let beta_x = self
                    .grad
                    .add_node(None, NodeType::Multiply(beta_lit, x_node));
                let sig = self.grad.add_node(None, NodeType::Sigmoid(beta_x));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, sig));
                self.acc(*x, g_x)?;
            }
            NodeType::GELU(x) => {
                // GELU(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))
                // Derivative is complex, using numerical approximation:
                // Simplified: GELU'(x) ≈ sigmoid(1.702*x)
                let x_node = self.import(*x)?;
                let coef = self.lit_scalar(1.702);
                let scaled_x = self.grad.add_node(None, NodeType::Multiply(coef, x_node));
                let sig = self.grad.add_node(None, NodeType::Sigmoid(scaled_x));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, sig));
                self.acc(*x, g_x)?;
            }
            NodeType::SiLU(x) => {
                // SiLU(x) = x * sigmoid(x)
                // d/dx SiLU(x) = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
                //             = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
                // Simplified: SiLU'(x) = sigmoid(x) + SiLU(x) * (1 - sigmoid(x))
                let x_node = self.import(*x)?;
                let silu_x = self.import(node)?;
                let sig_x = self.grad.add_node(None, NodeType::Sigmoid(x_node));
                let one = self.lit_scalar(1.0);
                let one_minus_sig = self.grad.add_node(None, NodeType::Subtract(one, sig_x));
                let silu_times_oneminus = self
                    .grad
                    .add_node(None, NodeType::Multiply(silu_x, one_minus_sig));
                let deriv = self
                    .grad
                    .add_node(None, NodeType::Add(sig_x, silu_times_oneminus));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, deriv));
                self.acc(*x, g_x)?;
            }
            NodeType::Power(base, power) => {
                // d/dx x^n = n * x^(n-1)
                // Here power is a scalar (constant)
                let base_node = self.import(*base)?;
                let power_node = self.import(*power)?;
                // grad = power * base^(power-1) * g_out
                let one = self.lit_scalar(1.0);
                let power_minus_one = self
                    .grad
                    .add_node(None, NodeType::Subtract(power_node, one));
                let base_pow = self
                    .grad
                    .add_node(None, NodeType::Power(base_node, power_minus_one));
                let scaled = self
                    .grad
                    .add_node(None, NodeType::Multiply(power_node, base_pow));
                let g_base = self.grad.add_node(None, NodeType::Multiply(g_out, scaled));
                self.acc(*base, g_base)?;
                // Gradient w.r.t. power not supported (usually power is a constant)
            }
            NodeType::Embedding { indices, weight } => {
                // Gradient w.r.t. indices is undefined (discrete indices)
                // Gradient w.r.t. weight is scatter-add operation
                let weight_node = self.src.get_node(*weight)?;
                let weight_shape = weight_node.shape.as_ref().unwrap();
                let num_embeddings = weight_shape[0];

                let indices_node = self.import(*indices)?;
                let g_weight = self.grad.add_node(
                    None,
                    NodeType::EmbeddingGrad {
                        grad_output: g_out,
                        indices: indices_node,
                        num_embeddings,
                    },
                );
                self.acc(*weight, g_weight)?;
            }
            NodeType::Clamp(x, min_val, max_val) => {
                // d/dx clamp(x, min, max) = 1 if min < x < max else 0
                // mask = (x > min) AND (x < max)
                let x_node = self.import(*x)?;
                let min_lit = self.lit_scalar(*min_val);
                let max_lit = self.lit_scalar(*max_val);
                let gt_min = self
                    .grad
                    .add_node(None, NodeType::GreaterThan(x_node, min_lit));
                // x < max => NOT(x > max) => 1 - (x > max)
                let gt_max = self
                    .grad
                    .add_node(None, NodeType::GreaterThan(x_node, max_lit));
                let one = self.lit_scalar(1.0);
                let lt_max = self.grad.add_node(None, NodeType::Subtract(one, gt_max));
                let mask = self.grad.add_node(None, NodeType::Multiply(gt_min, lt_max));
                let g_x = self.grad.add_node(None, NodeType::Multiply(g_out, mask));
                self.acc(*x, g_x)?;
            }
            NodeType::AvgPool2d {
                input,
                kernel_size,
                stride,
                padding,
            } => {
                // Gradient distributes uniformly across the pooling window
                let input_node = self.import(*input)?;
                let g_input = self.grad.add_node(
                    None,
                    NodeType::AvgUnpool2d {
                        input: g_out,
                        original_input: input_node,
                        kernel_size: *kernel_size,
                        stride: *stride,
                        padding: *padding,
                    },
                );
                self.acc(*input, g_input)?;
            }
            NodeType::Conv2d {
                input,
                weight,
                bias,
                stride,
                padding,
                dilation,
                groups,
            } => {
                // Get input and weight shapes from source graph
                let input_node_src = self.src.get_node(*input)?;
                let weight_node_src = self.src.get_node(*weight)?;

                let input_shape = input_node_src.shape.as_ref().ok_or({
                    AutogradError::Shape(ShapeInferenceError::MissingShapeInfo(*input))
                })?;
                let weight_shape = weight_node_src.shape.as_ref().ok_or({
                    AutogradError::Shape(ShapeInferenceError::MissingShapeInfo(*weight))
                })?;

                // Convert shapes to tuples
                let input_shape_tuple = (
                    input_shape[0],
                    input_shape[1],
                    input_shape[2],
                    input_shape[3],
                );
                let weight_shape_tuple = (
                    weight_shape[0],
                    weight_shape[1],
                    weight_shape[2],
                    weight_shape[3],
                );

                // Import input and weight nodes for gradient computation
                let input_node = self.import(*input)?;
                let weight_node = self.import(*weight)?;

                // Gradient w.r.t. input: transposed convolution
                let g_input = self.grad.add_node(
                    None,
                    NodeType::Conv2dBackwardInput {
                        grad_output: g_out,
                        weight: weight_node,
                        input_shape: input_shape_tuple,
                        stride: *stride,
                        padding: *padding,
                        dilation: *dilation,
                        groups: *groups,
                    },
                );
                self.acc(*input, g_input)?;

                // Gradient w.r.t. weight
                let g_weight = self.grad.add_node(
                    None,
                    NodeType::Conv2dBackwardWeight {
                        grad_output: g_out,
                        input: input_node,
                        weight_shape: weight_shape_tuple,
                        stride: *stride,
                        padding: *padding,
                        dilation: *dilation,
                        groups: *groups,
                    },
                );
                self.acc(*weight, g_weight)?;

                // Gradient w.r.t. bias (if present): sum over batch and spatial dimensions
                if let Some(b) = bias {
                    // Bias gradient is sum of grad_output over (N, H, W), keeping C
                    // For simplicity, we use Sum then reshape
                    // grad_bias = sum(grad_output, axis=[0, 2, 3])
                    let g_bias = self.grad.add_node(None, NodeType::Sum(g_out));
                    self.acc(*b, g_bias)?;
                }
            }
            NodeType::LayerNorm {
                input,
                gamma,
                beta,
                eps,
            } => {
                // Import needed nodes from forward graph
                let input_node = self.import(*input)?;
                let gamma_node = self.import(*gamma)?;

                // Gradient w.r.t. input: use specialized LayerNormBackward operation
                let g_input = self.grad.add_node(
                    None,
                    NodeType::LayerNormBackward {
                        grad_output: g_out,
                        input: input_node,
                        gamma: gamma_node,
                        eps: *eps,
                    },
                );
                self.acc(*input, g_input)?;

                // Gradient w.r.t. gamma: use specialized LayerNormGradGamma operation
                let g_gamma = self.grad.add_node(
                    None,
                    NodeType::LayerNormGradGamma {
                        grad_output: g_out,
                        input: input_node,
                        eps: *eps,
                    },
                );
                self.acc(*gamma, g_gamma)?;

                // Gradient w.r.t. beta: use specialized LayerNormGradBeta operation
                let g_beta = self
                    .grad
                    .add_node(None, NodeType::LayerNormGradBeta { grad_output: g_out });
                self.acc(*beta, g_beta)?;
            }

            NodeType::BatchNorm {
                input,
                gamma,
                beta,
                eps,
                channel_axis,
            } => {
                let input_node = self.import(*input)?;
                let gamma_node = self.import(*gamma)?;

                let g_input = self.grad.add_node(
                    None,
                    NodeType::BatchNormBackward {
                        grad_output: g_out,
                        input: input_node,
                        gamma: gamma_node,
                        eps: *eps,
                        channel_axis: *channel_axis,
                    },
                );
                self.acc(*input, g_input)?;

                let g_gamma = self.grad.add_node(
                    None,
                    NodeType::BatchNormGradGamma {
                        grad_output: g_out,
                        input: input_node,
                        eps: *eps,
                        channel_axis: *channel_axis,
                    },
                );
                self.acc(*gamma, g_gamma)?;

                let g_beta = self.grad.add_node(
                    None,
                    NodeType::BatchNormGradBeta {
                        grad_output: g_out,
                        channel_axis: *channel_axis,
                    },
                );
                self.acc(*beta, g_beta)?;
            }

            NodeType::Slice {
                input, axis, start, ..
            } => {
                // dL/dx is g_out zero-padded back to input's shape along `axis`.
                let input_shape = self.src.get_node(*input)?.shape.as_ref().ok_or_else(|| {
                    AutogradError::Asg(AsgError::InvalidGraph(format!(
                        "Slice backward: input node {} has no shape",
                        input
                    )))
                })?;
                let full_size = input_shape[*axis];
                let g_input = self.grad.add_node(
                    None,
                    NodeType::SliceBackward {
                        grad_output: g_out,
                        axis: *axis,
                        start: *start,
                        full_size,
                    },
                );
                self.acc(*input, g_input)?;
            }

            NodeType::Concat { inputs, axis } => {
                // dL/dx_i is slice of g_out along `axis` corresponding to x_i's range.
                let mut offset = 0usize;
                let input_ids: Vec<NodeId> = inputs.clone();
                let axis_val = *axis;
                for input_id in input_ids {
                    let input_shape =
                        self.src.get_node(input_id)?.shape.as_ref().ok_or_else(|| {
                            AutogradError::Asg(AsgError::InvalidGraph(format!(
                                "Concat backward: input node {} has no shape",
                                input_id
                            )))
                        })?;
                    let width = input_shape[axis_val];
                    let g_slice = self.grad.add_node(
                        None,
                        NodeType::Slice {
                            input: g_out,
                            axis: axis_val,
                            start: offset,
                            end: offset + width,
                        },
                    );
                    self.acc(input_id, g_slice)?;
                    offset += width;
                }
            }

            NodeType::DropoutMask { .. } => {
                // The mask is a sampled random variable, not differentiable
                // w.r.t. the shape provider. The actual gradient flow goes
                // through the `Multiply(input, mask)` node instead — the
                // backward rule for `Multiply` already routes `grad * mask`
                // back to `input`, which is what we want.
            }

            NodeType::MeanAxis { input, axis, .. } => {
                // dL/dx[..., k, ...] = grad_y[..., ...] / N, broadcast along axis.
                let input_shape = self.src.get_node(*input)?.shape.as_ref().ok_or_else(|| {
                    AutogradError::Asg(AsgError::InvalidGraph(format!(
                        "MeanAxis backward: input node {} has no shape",
                        input
                    )))
                })?;
                let n = input_shape[*axis] as f32;
                let scale = self.lit_scalar(1.0 / n);
                // Use the original `input` (imported as External) as the
                // broadcast target — its shape is exactly what we need.
                let input_ext = self.import(*input)?;
                let scaled = self.grad.add_node(None, NodeType::Multiply(g_out, scale));
                let broadcast = self
                    .grad
                    .add_node(None, NodeType::Broadcast(scaled, input_ext));
                self.acc(*input, broadcast)?;
            }

            NodeType::VarianceAxis { .. } => {
                // VarianceAxis is currently used inside specialised BatchNorm
                // nodes that have their own backward rules; standalone use
                // would require a chain-rule implementation we don't yet need.
            }

            _ => {}
        }
        Ok(())
    }

    /// Collects every node reachable from `start` through operand edges.
    ///
    /// The returned list is in post-order: a node appears only after all of
    /// the operands reported for it by `inputs_of`, so `start` itself is the
    /// last element.
    ///
    /// # Errors
    ///
    /// Propagates any error produced by `dfs` while walking the source graph.
    fn topo(&self, start: NodeId) -> Result<Vec<NodeId>, AutogradError> {
        let mut visited = HashSet::new();
        let mut post_order = Vec::new();
        self.dfs(start, &mut visited, &mut post_order)
            .map(|()| post_order)
    }

    /// Depth-first post-order walk of the source graph starting at `id`.
    ///
    /// Every node that is not yet in `vis` and is reachable from `id` through
    /// the operand lists returned by `inputs_of` is inserted into `vis` and
    /// appended to `order` after all of its operands. Nodes already present in
    /// `vis` are left alone, so repeated calls sharing the same `vis` never
    /// emit a node twice.
    ///
    /// The traversal uses an explicit, heap-allocated stack instead of
    /// recursion so that very deep dependency chains cannot overflow the
    /// thread stack. Visit order and output order match the recursive
    /// formulation exactly.
    ///
    /// # Errors
    ///
    /// Returns the error from `self.src.get_node` (converted into
    /// `AutogradError`) for the first node that cannot be looked up. `vis` and
    /// `order` keep whatever was recorded up to that point.
    fn dfs(
        &self,
        id: NodeId,
        vis: &mut HashSet<NodeId>,
        order: &mut Vec<NodeId>,
    ) -> Result<(), AutogradError> {
        if !vis.insert(id) {
            return Ok(());
        }

        // Each frame is a node plus an iterator over its not-yet-explored operands.
        let root_inputs = inputs_of(&self.src.get_node(id)?.node_type);
        let mut stack = vec![(id, root_inputs.into_iter())];

        while let Some(frame) = stack.last_mut() {
            // Copy out what we need so the borrow of `stack` ends before we mutate it.
            let current = frame.0;
            let next_input = frame.1.next();

            match next_input {
                Some(child) => {
                    // Mark on entry (before the lookup), exactly like the recursive version.
                    if vis.insert(child) {
                        let child_inputs = inputs_of(&self.src.get_node(child)?.node_type);
                        stack.push((child, child_inputs.into_iter()));
                    }
                }
                None => {
                    // All operands have been emitted; now emit the node itself.
                    order.push(current);
                    stack.pop();
                }
            }
        }
        Ok(())
    }
}

/// Lists the node ids that a node of type `nt` reads as operands.
///
/// Operands are returned in a fixed per-variant order (for `Conv2d` and
/// `ConvTranspose2d` the optional bias comes last and only when present).
/// Any variant not matched explicitly yields an empty list.
fn inputs_of(nt: &NodeType) -> Vec<NodeId> {
    match nt {
        // Tuple variants with two node operands.
        NodeType::Add(lhs, rhs)
        | NodeType::Subtract(lhs, rhs)
        | NodeType::Multiply(lhs, rhs)
        | NodeType::Divide(lhs, rhs)
        | NodeType::MatrixMultiply(lhs, rhs)
        | NodeType::GreaterThan(lhs, rhs)
        | NodeType::Power(lhs, rhs)
        | NodeType::Broadcast(lhs, rhs)
        | NodeType::Reshape(lhs, rhs)
        | NodeType::ReduceSumTo(lhs, rhs) => vec![*lhs, *rhs],

        // Tuple variants with a single node operand; any extra payload is ignored.
        NodeType::ReLU(x)
        | NodeType::Sum(x)
        | NodeType::Sigmoid(x)
        | NodeType::Softmax(x)
        | NodeType::Mean(x)
        | NodeType::Variance(x)
        | NodeType::Sqrt(x)
        | NodeType::Log(x)
        | NodeType::Exp(x)
        | NodeType::Neg(x)
        | NodeType::Abs(x)
        | NodeType::Tanh(x)
        | NodeType::GELU(x)
        | NodeType::SiLU(x)
        | NodeType::Transpose(x, ..)
        | NodeType::LeakyReLU(x, _)
        | NodeType::ELU(x, _)
        | NodeType::Softplus(x, _)
        | NodeType::Clamp(x, _, _) => vec![*x],

        // Struct variants whose only node operand is `input`.
        NodeType::MaxPool2d { input, .. }
        | NodeType::AvgPool2d { input, .. }
        | NodeType::AdaptiveAvgPool2d { input, .. }
        | NodeType::Slice { input, .. }
        | NodeType::MeanAxis { input, .. }
        | NodeType::VarianceAxis { input, .. } => vec![*input],

        // Unpooling reads both the incoming value and the original pooled input.
        NodeType::MaxUnpool2d {
            input,
            original_input,
            ..
        }
        | NodeType::AvgUnpool2d {
            input,
            original_input,
            ..
        } => vec![*input, *original_input],

        // Convolutions: input, weight, then the bias if one is attached.
        NodeType::Conv2d {
            input,
            weight,
            bias,
            ..
        }
        | NodeType::ConvTranspose2d {
            input,
            weight,
            bias,
            ..
        } => {
            let mut operands = vec![*input, *weight];
            operands.extend(bias.iter().copied());
            operands
        }

        NodeType::Conv2dBackwardInput {
            grad_output,
            weight,
            ..
        } => vec![*grad_output, *weight],

        NodeType::Embedding { indices, weight } => vec![*indices, *weight],
        NodeType::EmbeddingGrad {
            grad_output,
            indices,
            ..
        } => vec![*grad_output, *indices],

        // Normalization forward nodes.
        NodeType::LayerNorm {
            input, gamma, beta, ..
        }
        | NodeType::BatchNorm {
            input, gamma, beta, ..
        } => vec![*input, *gamma, *beta],

        // Normalization input-gradient nodes.
        NodeType::LayerNormBackward {
            grad_output,
            input,
            gamma,
            ..
        }
        | NodeType::BatchNormBackward {
            grad_output,
            input,
            gamma,
            ..
        } => vec![*grad_output, *input, *gamma],

        // Gradient nodes that read the upstream gradient and the forward input.
        NodeType::Conv2dBackwardWeight {
            grad_output, input, ..
        }
        | NodeType::LayerNormGradGamma {
            grad_output, input, ..
        }
        | NodeType::BatchNormGradGamma {
            grad_output, input, ..
        } => vec![*grad_output, *input],

        // Gradient nodes that read only the upstream gradient.
        NodeType::LayerNormGradBeta { grad_output }
        | NodeType::BatchNormGradBeta { grad_output, .. }
        | NodeType::SliceBackward { grad_output, .. } => vec![*grad_output],

        NodeType::Concat { inputs, .. } => inputs.to_vec(),
        NodeType::DropoutMask { shape_provider, .. } => vec![*shape_provider],

        // Everything else is treated as having no operands.
        _ => Vec::new(),
    }
}