ferrotorch-nn 0.6.1

//! `Module<T>` trait, `Reduction`, and `StateDict<T>` — the Rust analog of
//! PyTorch's `torch.nn.Module` base class.
//!
//! Every neural-network layer in `ferrotorch-nn` implements `Module<T>`. The
//! trait composes parameter iteration, buffer iteration, train/eval mode,
//! device transfer, sub-module walks, hook registration, gradient zeroing,
//! and state-dict load/save.
//!
//! ## REQ status (per `.design/ferrotorch-nn/module.md`)
//!
//! | REQ | Status | Evidence |
//! |---|---|---|
//! | REQ-1 | SHIPPED | `pub type StateDict<T> = HashMap<String, Tensor<T>>` mirrors `torch/nn/modules/module.py:1980-2060`; consumed by `pub use module::{Module, Reduction, StateDict}` at `lib.rs:218` and every layer's `state_dict()` return. |
//! | REQ-2 | SHIPPED | `pub enum Reduction { Mean, Sum, None }` mirrors PyTorch's loss `reduction` kwarg; consumed by `ferrotorch-nn/src/loss.rs:19` `use crate::module::Reduction` and `ferrotorch-nn/src/functional.rs:1798`. |
//! | REQ-3 | SHIPPED | `pub trait Module<T: Float>: Send + Sync` mirrors `torch/nn/modules/module.py:407-441`; consumed by every layer file (`linear.rs`, `conv.rs`, `norm.rs`, `embedding.rs`, etc.) via `impl Module<T> for ...`. |
//! | REQ-4 | SHIPPED | `forward`, `parameters`, `parameters_mut`, `named_parameters` required methods; consumed by `ferrotorch-optim/src/optimizer.rs:5` and every optimizer (`adam.rs:17`, `adadelta.rs:20`, …) reading `.parameters()`. |
//! | REQ-5 | SHIPPED | `train`, `eval`, `is_training` required methods; consumed by `ferrotorch-nn/src/container.rs` `Sequential::train` / `::eval` propagation. |
//! | REQ-6 | SHIPPED | Default `to_device` iterates parameters and buffers, mirroring `torch/nn/modules/module.py:1180-1260`; consumed by downstream `model.to_device(Device::Cuda(0))` calls in model composition code. |
//! | REQ-7 | SHIPPED | Default `state_dict` unions parameters with buffers, mirroring `module.py:1980-2060`; consumed by `ferrotorch-nn/src/hooks.rs` `HookedModule::state_dict` delegation and SafeTensors export. |
//! | REQ-8 | SHIPPED | `load_state_dict(strict)` two-pass strict + shape-validate, mirroring `module.py:2150-2310`; consumed by SafeTensors / GGUF loaders in downstream crates. |
//! | REQ-9 | SHIPPED | `buffers` / `buffers_mut` / `named_buffers` default-empty methods mirror `module.py:2430-2490`; consumed by `ferrotorch-nn/src/norm.rs` `BatchNorm*` overrides and `module.rs` default `load_state_dict` (the `*buf = Buffer::new(...)` site). |
//! | REQ-10 | SHIPPED | `as_any` default returns `None` for the #984 downcast hook; consumed by `ferrotorch-nn/src/norm.rs` `BatchNorm*` overriding it so `ferrotorch-vision`'s state-dict loader can route to BN running stats. |
//! | REQ-11 | SHIPPED | `children`, `named_children`, `modules` (`Self: Sized`), `descendants_dyn` (object-safe), `named_modules`, `named_descendants_dyn` mirror `module.py:2510-2640`; consumed by `ferrotorch-nn/src/container.rs` containers and downstream state-dict walkers. |
//! | REQ-12 | SHIPPED | Empty-parent path branch in `named_descendants_dyn` fixes #1142 (DeepLabV3 BN-buffer routing); consumed by `ferrotorch-vision`'s DeepLabV3 backbone load path; pinned by `module_named_descendants_dyn_empty_parent_no_leading_dot`. |
//! | REQ-13 | SHIPPED | `with_forward_hook` / `with_forward_pre_hook` / `with_backward_hook` `Self: Sized` methods return `(HookedModule<Self, T>, HookHandle)`; consumed by `ferrotorch-nn/src/hooks.rs` (`HookedModule` is instantiated by these) and downstream observability code. |
//! | REQ-14 | SHIPPED | Default `zero_grad` walks `parameters()` mirroring `module.py:2700-2740`; consumed by `ferrotorch-optim` and `ferrotorch-train/src/grad_utils.rs` calling `model.zero_grad()` per step. |
//! | REQ-15 | SHIPPED | Default `requires_grad_(bool)` toggles all parameters mirroring `module.py:2680-2700`; consumed by transfer-learning callsites that freeze the backbone via `backbone.requires_grad_(false)`. |
//! | REQ-16 | SHIPPED | Default `apply_to_parameters(&mut dyn FnMut(...))` mirrors `torch.nn.Module.apply` for the parameter case; consumed by lazy-init paths in downstream lazy layers (`lazy_linear.rs`, `lazy_conv.rs`). |

use std::collections::HashMap;

use ferrotorch_core::{Device, FerrotorchError, FerrotorchResult, Float, Tensor};

use crate::buffer::Buffer;
use crate::hooks::{BackwardHook, ForwardHook, ForwardPreHook, HookHandle, HookedModule};
use crate::parameter::Parameter;

/// A map from state names to tensors, used for serialization.
///
/// The default [`Module::state_dict`] fills it with one entry per named
/// parameter followed by one entry per named buffer, and the default
/// [`Module::load_state_dict`] looks entries up by those same names.
pub type StateDict<T> = HashMap<String, Tensor<T>>;

/// Reduction mode for loss functions.
///
/// Note that [`Reduction::None`] is a variant of this enum and is unrelated
/// to `Option::None`; spell it with the `Reduction::` prefix so the two are
/// never confused at a call site.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Reduction {
    /// Return the mean of all losses.
    Mean,
    /// Return the sum of all losses.
    Sum,
    /// Return the unreduced loss tensor.
    None,
}

/// The trait that all neural network layers implement.
///
/// Requires `Send + Sync` to match `Tensor<T>`'s thread-safety guarantees.
pub trait Module<T: Float>: Send + Sync {
    /// Forward pass. Takes input tensor, returns output tensor.
    ///
    /// # Errors
    ///
    /// Implementations report failures through the returned
    /// `FerrotorchResult`.
    fn forward(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>>;

    /// Iterate over all learnable parameters.
    ///
    /// The default `zero_grad` walks this list.
    fn parameters(&self) -> Vec<&Parameter<T>>;

    /// Iterate over all learnable parameters mutably.
    ///
    /// The default `to_device`, `requires_grad_`, `apply_to_parameters` and
    /// `load_state_dict` all go through this list.
    ///
    /// Ordering contract: the default `load_state_dict` pairs the names
    /// yielded by `named_parameters()` with the references yielded here
    /// *by position*, so both methods must return the same parameters in
    /// the same order.
    fn parameters_mut(&mut self) -> Vec<&mut Parameter<T>>;

    /// Named parameters for state dict serialization.
    ///
    /// Keys use dot-separated paths for nested modules
    /// (e.g., `"layer1.weight"`, `"layer1.bias"`).
    ///
    /// These names become the parameter keys of the default `state_dict`
    /// and are the keys the default `load_state_dict` looks up. The order
    /// must match `parameters_mut()` (see there).
    fn named_parameters(&self) -> Vec<(String, &Parameter<T>)>;

    /// Set training mode. Affects dropout, batchnorm, etc.
    fn train(&mut self);

    /// Set evaluation mode.
    fn eval(&mut self);

    /// Whether the module is in training mode.
    fn is_training(&self) -> bool;

    /// Transfer every parameter and buffer of this module to `device`.
    ///
    /// The default implementation visits `parameters_mut()` first and
    /// `buffers_mut()` second, overwriting each slot with the result of its
    /// own `to(device)` call.
    ///
    /// # Errors
    ///
    /// Stops at the first failing transfer and returns that error. Slots
    /// visited before the failure have already been replaced.
    fn to_device(&mut self, device: Device) -> FerrotorchResult<()> {
        self.parameters_mut()
            .into_iter()
            .try_for_each(|slot| -> FerrotorchResult<()> {
                *slot = slot.to(device)?;
                Ok(())
            })?;
        self.buffers_mut()
            .into_iter()
            .try_for_each(|slot| -> FerrotorchResult<()> {
                *slot = slot.to(device)?;
                Ok(())
            })
    }

    /// Export parameters and buffers as a state dict (torch parity).
    ///
    /// Buffers are included alongside parameters since both are persistent
    /// module state. Keys are dot-separated paths.
    fn state_dict(&self) -> StateDict<T> {
        let mut out: StateDict<T> = self
            .named_parameters()
            .into_iter()
            .map(|(name, param)| (name, param.tensor().clone()))
            .collect();
        for (name, buffer) in self.named_buffers() {
            out.insert(name, buffer.tensor().clone());
        }
        out
    }

    // -----------------------------------------------------------------
    // Buffers — non-trainable persistent state. (#583)
    // -----------------------------------------------------------------

    /// All non-trainable buffers of this module (e.g. running mean /
    /// variance in BatchNorm).
    ///
    /// The default is an empty list; modules that own buffers override it.
    fn buffers(&self) -> Vec<&Buffer<T>> {
        vec![]
    }

    /// Mutable access to all buffers. The default is an empty list.
    ///
    /// The default `load_state_dict` pairs these references positionally
    /// with the names from `named_buffers()`, so overrides must keep both
    /// in the same order.
    fn buffers_mut(&mut self) -> Vec<&mut Buffer<T>> {
        vec![]
    }

    /// Buffers paired with their names (dot-separated paths for nested
    /// modules). The default is an empty list.
    fn named_buffers(&self) -> Vec<(String, &Buffer<T>)> {
        vec![]
    }

    /// Downcast hook for type-erased buffer-loader dispatch. (#984)
    ///
    /// Concrete module types whose non-`Buffer<T>` persistent state needs
    /// to be applied from a state dict override this to return `Some(self)`
    /// (coerced to `&dyn Any`). Currently that is `BatchNorm1d` /
    /// `BatchNorm2d` / `BatchNorm3d`'s running mean / variance /
    /// `num_batches_tracked` — see Phase 2 of the value-parity pipeline in
    /// `ferrotorch-vision/tests`.
    ///
    /// The default returns `None`, so existing modules are unaffected:
    /// type-erased callers walking `named_modules()` will simply skip
    /// modules that do not opt in. Implementors MUST return
    /// `Some(self)`; returning `Some` for an unrelated `Any` would
    /// violate the contract.
    ///
    /// Why a downcast hook instead of a wider trait surface (e.g. a
    /// dedicated `set_buffer_value(&self, &str, &Tensor<T>)` method on
    /// `Module`)? Buffers carrying torch-shaped state (running mean /
    /// variance, `num_batches_tracked: usize`) currently live outside
    /// the [`Buffer<T>`] abstraction (BN keeps `Mutex<Vec<f64>>` for
    /// numerical stability and the integer counter has no `Buffer`
    /// at all), so a single typed setter on `Module` would force a
    /// premature unification that #984 explicitly defers. The downcast
    /// hook keeps `Module` free of BN-specific shape and lets concrete
    /// modules expose their own typed setters at full precision.
    fn as_any(&self) -> Option<&dyn std::any::Any> {
        None
    }

    // -----------------------------------------------------------------
    // Submodule iteration. (#583)
    // -----------------------------------------------------------------

    /// The immediate sub-modules of this module.
    ///
    /// The default is an empty list, i.e. the module is treated as a leaf.
    fn children(&self) -> Vec<&dyn Module<T>> {
        vec![]
    }

    /// The immediate sub-modules paired with their attribute names.
    ///
    /// The default is an empty list. `named_descendants_dyn` builds its
    /// dot-separated paths from these names.
    fn named_children(&self) -> Vec<(String, &dyn Module<T>)> {
        vec![]
    }

    /// Every module in this subtree in depth-first order: `self` first,
    /// followed by everything `descendants_dyn` yields.
    ///
    /// Requires `Self: Sized` so that `self` can be coerced to
    /// `&dyn Module<T>`. Trait-object callers can use
    /// [`Module::descendants_dyn`] (which yields descendants only) and
    /// prepend their own reference.
    fn modules(&self) -> Vec<&dyn Module<T>>
    where
        Self: Sized,
    {
        let root: &dyn Module<T> = self;
        std::iter::once(root)
            .chain(self.descendants_dyn())
            .collect()
    }

    /// Every strict descendant of `self` (so `self` is excluded) in
    /// depth-first order: each child is immediately followed by that
    /// child's own descendants. Object-safe.
    fn descendants_dyn(&self) -> Vec<&dyn Module<T>> {
        self.children()
            .into_iter()
            .flat_map(|child| std::iter::once(child).chain(child.descendants_dyn()))
            .collect()
    }

    /// Every module in this subtree paired with its dot-separated path.
    ///
    /// The root (`self`) comes first under the empty name `""`; the
    /// remaining entries are exactly what `named_descendants_dyn` yields.
    fn named_modules(&self) -> Vec<(String, &dyn Module<T>)>
    where
        Self: Sized,
    {
        let root: (String, &dyn Module<T>) = (String::new(), self);
        std::iter::once(root)
            .chain(self.named_descendants_dyn())
            .collect()
    }

    /// Strict descendants paired with dot-separated paths. Object-safe.
    ///
    /// Each child is emitted under its own name, followed by the child's
    /// descendants with the child's name prefixed. Empty path segments are
    /// dropped rather than joined, so no path ever gains a leading or
    /// trailing `.`.
    fn named_descendants_dyn(&self) -> Vec<(String, &dyn Module<T>)> {
        let mut entries: Vec<(String, &dyn Module<T>)> = Vec::new();
        for (child_name, child) in self.named_children() {
            entries.push((child_name.clone(), child));
            let nested = child
                .named_descendants_dyn()
                .into_iter()
                .map(|(tail, module)| {
                    let path = match (child_name.is_empty(), tail.is_empty()) {
                        // The descendant itself sits at "" below the child:
                        // it shares the child's path.
                        (_, true) => child_name.clone(),
                        // Transparent wrapper: parent exposes child under ""
                        // so the child's own naming (e.g. `("backbone", inner)`)
                        // becomes the canonical path. Without this arm the
                        // walker would produce `".backbone.X"` (leading dot),
                        // mismatching state-dict keys like `"backbone.X"`.
                        // See #1142 for the DeepLabV3 BN-buffer routing bug
                        // that this arm fixes.
                        (true, false) => tail,
                        (false, false) => format!("{child_name}.{tail}"),
                    };
                    (path, module)
                });
            entries.extend(nested);
        }
        entries
    }

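    // -----------------------------------------------------------------
    // Helpers. (#583)
    //
    // The helper methods themselves (`zero_grad`, `requires_grad_`,
    // `apply_to_parameters`) are defined after the hook-registration
    // methods below.
    // -----------------------------------------------------------------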

    // -----------------------------------------------------------------
    // Hooks (#606)
    //
    // These consume `self` and return a [`HookedModule<Self, T>`] with the
    // requested hook already registered. Mirrors `torch.nn.Module
    // .register_*_hook(...)` ergonomically — callers no longer need to
    // wrap manually with `HookedModule::new(..)` first. Gated on
    // `Self: Sized` so the trait stays dyn-compatible.
    //
    // Named with the `with_*` prefix (rather than `register_*` directly) to
    // avoid clashing with `HookedModule`'s own inherent `register_*` methods,
    // which take `&self` and append a hook to an already-wrapped instance.
    // The two surfaces compose: `Linear::new(..)?.with_forward_hook(h1).0`
    // is a `HookedModule` that can `.register_forward_hook(h2)` again.
    // -----------------------------------------------------------------

    /// Consume this module, wrap it in a [`HookedModule`], and register
    /// `hook` as a forward hook on the wrapper.
    ///
    /// Returns the wrapper together with the [`HookHandle`] produced by the
    /// registration, which can be used to remove the hook later. The
    /// wrapper implements `Module<T>` itself, so it slots into any place
    /// the original module did. Mirrors
    /// `torch.nn.Module.register_forward_hook`.
    fn with_forward_hook(self, hook: ForwardHook<T>) -> (HookedModule<Self, T>, HookHandle)
    where
        Self: Sized,
    {
        let hooked = HookedModule::new(self);
        let registration = hooked.register_forward_hook(hook);
        (hooked, registration)
    }

    /// Consume this module, wrap it in a [`HookedModule`], and register
    /// `hook` as a forward pre-hook on the wrapper.
    ///
    /// Returns the wrapper and the [`HookHandle`] for the new registration;
    /// see [`Self::with_forward_hook`]. Mirrors
    /// `torch.nn.Module.register_forward_pre_hook`.
    fn with_forward_pre_hook(self, hook: ForwardPreHook<T>) -> (HookedModule<Self, T>, HookHandle)
    where
        Self: Sized,
    {
        let hooked = HookedModule::new(self);
        let registration = hooked.register_forward_pre_hook(hook);
        (hooked, registration)
    }

    /// Consume this module, wrap it in a [`HookedModule`], and register
    /// `hook` as a backward hook on the wrapper.
    ///
    /// Returns the wrapper and the [`HookHandle`] for the new registration;
    /// see [`Self::with_forward_hook`]. Mirrors
    /// `torch.nn.Module.register_backward_hook`.
    fn with_backward_hook(self, hook: BackwardHook<T>) -> (HookedModule<Self, T>, HookHandle)
    where
        Self: Sized,
    {
        let hooked = HookedModule::new(self);
        let registration = hooked.register_backward_hook(hook);
        (hooked, registration)
    }

    /// Set the gradient of every parameter to `None`.
    ///
    /// Calls `zero_grad()` on the underlying tensor of each entry of
    /// `parameters()`. Mirrors `torch.nn.Module.zero_grad`.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by a tensor's `zero_grad()`;
    /// parameters after the failing one are not visited.
    fn zero_grad(&self) -> FerrotorchResult<()> {
        self.parameters()
            .into_iter()
            .try_for_each(|param| -> FerrotorchResult<()> {
                param.tensor().zero_grad()?;
                Ok(())
            })
    }

    /// Set `requires_grad` to the given value on every entry of
    /// `parameters_mut()` (freeze / unfreeze the module). Mirrors
    /// `torch.nn.Module.requires_grad_`.
    fn requires_grad_(&mut self, requires_grad: bool) {
        self.parameters_mut().into_iter().for_each(|param| {
            param.set_requires_grad(requires_grad);
        });
    }

    /// Invoke `f` once on every entry of `parameters_mut()`, in order.
    ///
    /// Mirrors `torch.nn.Module.apply` for the parameter case (true `apply`
    /// recurses over all submodules; the recursive form requires
    /// `&mut dyn Module` which conflicts with this trait's `&mut self`
    /// borrow).
    ///
    /// `f` is a `&mut dyn FnMut(...)` rather than a generic closure so the
    /// trait stays dyn-compatible — `Box<dyn Module<T>>` is a common usage
    /// pattern.
    fn apply_to_parameters(&mut self, f: &mut dyn FnMut(&mut Parameter<T>)) {
        self.parameters_mut()
            .into_iter()
            .for_each(|param| f(param));
    }

    /// Load parameters from a state dict.
    ///
    /// When `strict` is `true` (default), unexpected keys are an error.
    /// When `false`, unexpected keys are silently ignored and missing
    /// keys leave existing parameter values unchanged.
    fn load_state_dict(&mut self, state: &StateDict<T>, strict: bool) -> FerrotorchResult<()> {
        // Known keys: union of parameter and buffer paths.
        let mut known_keys: std::collections::HashSet<String> = self
            .named_parameters()
            .iter()
            .map(|(k, _)| k.clone())
            .collect();
        for (k, _) in self.named_buffers() {
            known_keys.insert(k);
        }

        if strict {
            for key in state.keys() {
                if !known_keys.contains(key) {
                    return Err(FerrotorchError::InvalidArgument {
                        message: format!("unexpected key in state_dict: \"{key}\""),
                    });
                }
            }
        }

        // We need mutable access to parameters. Use named_parameters to get
        // the mapping, then parameters_mut to actually update.
        // This two-pass approach avoids borrowing issues.
        let param_names: Vec<String> = self
            .named_parameters()
            .into_iter()
            .map(|(name, _)| name)
            .collect();

        let params_mut = self.parameters_mut();

        for (name, param) in param_names.iter().zip(params_mut) {
            if let Some(tensor) = state.get(name) {
                if param.shape() != tensor.shape() {
                    return Err(FerrotorchError::ShapeMismatch {
                        message: format!(
                            "state_dict shape mismatch for \"{name}\": expected {:?}, got {:?}",
                            param.shape(),
                            tensor.shape()
                        ),
                    });
                }
                // Replace the parameter data with the loaded tensor.
                *param = Parameter::new(tensor.clone());
            } else if strict {
                return Err(FerrotorchError::InvalidArgument {
                    message: format!("missing key in state_dict: \"{name}\""),
                });
            }
        }

        // Same dance for buffers.
        let buffer_names: Vec<String> = self
            .named_buffers()
            .into_iter()
            .map(|(name, _)| name)
            .collect();
        let buffers_mut = self.buffers_mut();
        for (name, buf) in buffer_names.iter().zip(buffers_mut) {
            if let Some(tensor) = state.get(name) {
                if buf.shape() != tensor.shape() {
                    return Err(FerrotorchError::ShapeMismatch {
                        message: format!(
                            "state_dict shape mismatch for buffer \"{name}\": expected {:?}, got {:?}",
                            buf.shape(),
                            tensor.shape()
                        ),
                    });
                }
                *buf = Buffer::new(tensor.clone());
            } else if strict {
                return Err(FerrotorchError::InvalidArgument {
                    message: format!("missing buffer key in state_dict: \"{name}\""),
                });
            }
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    /// Smallest possible `Module` fixture: a single `weight` parameter and
    /// a train/eval flag. All optional trait methods keep their defaults.
    struct SimpleModule<T: Float> {
        weight: Parameter<T>,
        training: bool,
    }

    impl<T: Float> SimpleModule<T> {
        /// Builds the fixture with a zero-initialised weight of shape
        /// `[size]`, starting in training mode.
        fn new(size: usize) -> FerrotorchResult<Self> {
            let weight: Parameter<T> = Parameter::zeros(&[size])?;
            Ok(Self {
                weight,
                training: true,
            })
        }
    }

    impl<T: Float> Module<T> for SimpleModule<T> {
        fn forward(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
            // Identity: hand back a clone of the input.
            let passthrough = input.clone();
            Ok(passthrough)
        }

        fn parameters(&self) -> Vec<&Parameter<T>> {
            Vec::from([&self.weight])
        }

        fn parameters_mut(&mut self) -> Vec<&mut Parameter<T>> {
            Vec::from([&mut self.weight])
        }

        fn named_parameters(&self) -> Vec<(String, &Parameter<T>)> {
            Vec::from([(String::from("weight"), &self.weight)])
        }

        fn train(&mut self) {
            self.training = true;
        }

        fn eval(&mut self) {
            self.training = false;
        }

        fn is_training(&self) -> bool {
            self.training
        }
    }

    /// `parameters()` exposes the single weight with the requested shape.
    #[test]
    fn test_module_parameters() {
        let module = SimpleModule::<f32>::new(5).unwrap();
        let params = module.parameters();
        assert_eq!(params.len(), 1);
        assert_eq!(params[0].shape(), &[5]);
    }

    /// `named_parameters()` yields exactly one entry, keyed `"weight"`.
    #[test]
    fn test_module_named_parameters() {
        let module = SimpleModule::<f32>::new(3).unwrap();
        let entries = module.named_parameters();
        assert_eq!(entries.len(), 1);
        let (first_name, _) = &entries[0];
        assert_eq!(first_name, "weight");
    }

    /// The training flag starts on and follows `eval()` / `train()`.
    #[test]
    fn test_module_train_eval() {
        let mut module = SimpleModule::<f32>::new(2).unwrap();
        assert!(module.is_training());

        module.eval();
        assert!(!module.is_training());

        module.train();
        assert!(module.is_training());
    }

    /// A state dict exported from one module loads strictly into another
    /// module of the same shape.
    #[test]
    fn test_module_state_dict_roundtrip() {
        let source = SimpleModule::<f32>::new(4).unwrap();
        let exported = source.state_dict();
        assert!(exported.contains_key("weight"));
        assert_eq!(exported["weight"].shape(), &[4]);

        let mut target = SimpleModule::<f32>::new(4).unwrap();
        target.load_state_dict(&exported, true).unwrap();
    }

    /// An unknown key is rejected in strict mode and tolerated otherwise.
    #[test]
    fn test_module_state_dict_strict_extra_key() {
        let mut module = SimpleModule::<f32>::new(3).unwrap();
        let dict: StateDict<f32> = vec![
            (
                String::from("weight"),
                ferrotorch_core::zeros::<f32>(&[3]).unwrap(),
            ),
            (
                String::from("extra"),
                ferrotorch_core::zeros::<f32>(&[1]).unwrap(),
            ),
        ]
        .into_iter()
        .collect();

        let strict_result = module.load_state_dict(&dict, true);
        assert!(strict_result.is_err());
        let lenient_result = module.load_state_dict(&dict, false);
        assert!(lenient_result.is_ok());
    }

    /// Loading a tensor whose shape differs from the parameter's fails.
    #[test]
    fn test_module_state_dict_shape_mismatch() {
        let mut module = SimpleModule::<f32>::new(3).unwrap();
        let wrong_shape = ferrotorch_core::zeros::<f32>(&[5]).unwrap();
        let dict: StateDict<f32> = std::iter::once((String::from("weight"), wrong_shape)).collect();

        let outcome = module.load_state_dict(&dict, true);
        assert!(outcome.is_err());
    }

    /// Compile-time check that the fixture satisfies `Send + Sync`.
    #[test]
    fn test_module_is_send_sync() {
        fn assert_send_sync<M>()
        where
            M: Send + Sync,
        {
        }
        assert_send_sync::<SimpleModule<f32>>();
    }

    /// `Reduction` variants compare equal to themselves and unequal to
    /// each other.
    #[test]
    fn test_reduction_enum() {
        let mean = Reduction::Mean;
        assert_eq!(mean, Reduction::Mean);
        assert_ne!(mean, Reduction::Sum);
    }

    /// Moving to the CPU device succeeds and keeps the parameter list and
    /// its shape intact.
    #[test]
    fn test_to_device_cpu_preserves_weights() {
        let mut module = SimpleModule::<f32>::new(4).unwrap();
        module.to_device(ferrotorch_core::Device::Cpu).unwrap();

        let params = module.parameters();
        assert_eq!(params.len(), 1);
        assert_eq!(params[0].shape(), &[4]);
    }

    /// Requesting `Cuda(0)` in this test environment is expected to
    /// surface an error rather than succeed.
    #[test]
    fn test_to_device_cuda_without_backend() {
        let mut module = SimpleModule::<f32>::new(3).unwrap();
        let outcome = module.to_device(ferrotorch_core::Device::Cuda(0));
        assert!(outcome.is_err());
    }

    // -----------------------------------------------------------------------
    // Module trait additions: buffers / children / zero_grad / requires_grad_ /
    // apply_to_parameters / modules iteration. (#583)
    // -----------------------------------------------------------------------

    /// Fixture with its own parameter, one buffer, and a `SimpleModule`
    /// child, used to exercise the buffer and sub-module trait methods.
    struct ParentModule<T: Float> {
        weight: Parameter<T>,
        running_mean: Buffer<T>,
        child: SimpleModule<T>,
    }

    impl<T: Float> ParentModule<T> {
        /// Builds a `[2, 2]` weight of ones, a `[2]` buffer of zeros and a
        /// size-3 child.
        fn new() -> FerrotorchResult<Self> {
            let weight: Parameter<T> = Parameter::ones(&[2, 2])?;
            let running_mean: Buffer<T> = Buffer::zeros(&[2])?;
            let child: SimpleModule<T> = SimpleModule::new(3)?;
            Ok(Self {
                weight,
                running_mean,
                child,
            })
        }
    }

    impl<T: Float> Module<T> for ParentModule<T> {
        fn forward(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
            self.child.forward(input)
        }

        fn parameters(&self) -> Vec<&Parameter<T>> {
            // Own weight first, then whatever the child reports.
            std::iter::once(&self.weight)
                .chain(self.child.parameters())
                .collect()
        }

        fn parameters_mut(&mut self) -> Vec<&mut Parameter<T>> {
            // Same order as `parameters()`; the two borrows touch disjoint
            // fields.
            std::iter::once(&mut self.weight)
                .chain(self.child.parameters_mut())
                .collect()
        }

        fn named_parameters(&self) -> Vec<(String, &Parameter<T>)> {
            let own = (String::from("weight"), &self.weight);
            let nested = self
                .child
                .named_parameters()
                .into_iter()
                .map(|(n, p)| (format!("child.{n}"), p));
            std::iter::once(own).chain(nested).collect()
        }

        fn buffers(&self) -> Vec<&Buffer<T>> {
            Vec::from([&self.running_mean])
        }

        fn buffers_mut(&mut self) -> Vec<&mut Buffer<T>> {
            Vec::from([&mut self.running_mean])
        }

        fn named_buffers(&self) -> Vec<(String, &Buffer<T>)> {
            Vec::from([(String::from("running_mean"), &self.running_mean)])
        }

        fn children(&self) -> Vec<&dyn Module<T>> {
            let only_child: &dyn Module<T> = &self.child;
            vec![only_child]
        }

        fn named_children(&self) -> Vec<(String, &dyn Module<T>)> {
            let only_child: &dyn Module<T> = &self.child;
            vec![(String::from("child"), only_child)]
        }

        // Training mode lives entirely in the child.
        fn train(&mut self) {
            self.child.train();
        }

        fn eval(&mut self) {
            self.child.eval();
        }

        fn is_training(&self) -> bool {
            self.child.is_training()
        }
    }

    /// `SimpleModule` keeps the default buffer methods, which report
    /// nothing.
    #[test]
    fn module_buffers_default_is_empty() {
        let module = SimpleModule::<f32>::new(3).unwrap();
        assert_eq!(module.buffers().len(), 0);
        assert_eq!(module.named_buffers().len(), 0);
    }

    /// A module that overrides the buffer methods reports its buffer both
    /// unnamed and named.
    #[test]
    fn module_buffers_listed_for_overriding_module() {
        let module = ParentModule::<f32>::new().unwrap();

        let plain = module.buffers();
        assert_eq!(plain.len(), 1);
        assert_eq!(plain[0].shape(), &[2]);

        let named = module.named_buffers();
        assert_eq!(named.len(), 1);
        assert_eq!(named[0].0, "running_mean");
    }

    /// The parent reports its single child, named `"child"`.
    #[test]
    fn module_children_listed_for_parent() {
        let module = ParentModule::<f32>::new().unwrap();
        assert_eq!(module.children().len(), 1);

        let named = module.named_children();
        assert_eq!(named.len(), 1);
        assert_eq!(named[0].0, "child");
    }

    /// `named_modules()` lists the root under `""` followed by the child.
    #[test]
    fn module_named_modules_includes_self_and_descendants() {
        let module = ParentModule::<f32>::new().unwrap();
        let names: Vec<String> = module
            .named_modules()
            .into_iter()
            .map(|(name, _)| name)
            .collect();
        // Root + 1 child = 2 entries.
        assert_eq!(names.len(), 2);
        assert_eq!(names[0], "");
        assert_eq!(names[1], "child");
    }

    /// `modules()` yields the root plus its one child.
    #[test]
    fn module_modules_includes_self_and_descendants() {
        let module = ParentModule::<f32>::new().unwrap();
        let subtree = module.modules();
        assert_eq!(subtree.len(), 2);
    }

    /// `zero_grad()` succeeds on a freshly built module that has no
    /// gradients yet.
    #[test]
    fn module_zero_grad_succeeds() {
        let module = SimpleModule::<f32>::new(3).unwrap();
        let outcome = module.zero_grad();
        outcome.unwrap();
    }

    /// `requires_grad_` flips the flag on every parameter, own and child
    /// alike, in both directions.
    #[test]
    fn module_requires_grad_toggles_all_parameters() {
        let mut module = ParentModule::<f32>::new().unwrap();
        assert!(module.parameters().iter().all(|p| p.requires_grad()));

        module.requires_grad_(false);
        assert!(module.parameters().iter().all(|p| !p.requires_grad()));

        module.requires_grad_(true);
        assert!(module.parameters().iter().all(|p| p.requires_grad()));
    }

    /// The callback runs exactly once per parameter.
    #[test]
    fn module_apply_to_parameters_visits_all() {
        let mut module = ParentModule::<f32>::new().unwrap();
        let expected = module.parameters().len();

        let mut visited = 0usize;
        let mut bump = |_: &mut Parameter<f32>| visited += 1;
        module.apply_to_parameters(&mut bump);

        assert_eq!(visited, expected);
    }

    /// The exported state dict holds the own weight, the buffer and the
    /// child's weight — and nothing else.
    #[test]
    fn module_state_dict_includes_buffers() {
        let module = ParentModule::<f32>::new().unwrap();
        let exported = module.state_dict();
        for expected_key in ["weight", "running_mean", "child.weight"].iter() {
            assert!(exported.contains_key(*expected_key));
        }
        assert_eq!(exported.len(), 3);
    }

    /// A strict load replaces the buffer contents with the values from
    /// the state dict.
    #[test]
    fn module_load_state_dict_with_buffer() {
        let mut module = ParentModule::<f32>::new().unwrap();
        let dict: StateDict<f32> = vec![
            (
                String::from("weight"),
                ferrotorch_core::ones::<f32>(&[2, 2]).unwrap(),
            ),
            (
                String::from("running_mean"),
                ferrotorch_core::from_slice::<f32>(&[7.0, 9.0], &[2]).unwrap(),
            ),
            (
                String::from("child.weight"),
                ferrotorch_core::zeros::<f32>(&[3]).unwrap(),
            ),
        ]
        .into_iter()
        .collect();

        module.load_state_dict(&dict, true).unwrap();

        let buffers = module.buffers();
        assert_eq!(buffers[0].data().unwrap(), &[7.0, 9.0]);
    }

    /// `descendants_dyn()` reports only the child, not the root itself.
    #[test]
    fn module_descendants_dyn_excludes_self() {
        let module = ParentModule::<f32>::new().unwrap();
        let below_root = module.descendants_dyn();
        assert_eq!(below_root.len(), 1);
    }

    /// The single descendant is reported under the path `"child"`.
    #[test]
    fn module_named_descendants_dyn_paths() {
        let module = ParentModule::<f32>::new().unwrap();
        let paths: Vec<String> = module
            .named_descendants_dyn()
            .into_iter()
            .map(|(path, _)| path)
            .collect();
        assert_eq!(paths.len(), 1);
        assert_eq!(paths[0], "child");
    }

    /// #1142 regression lock: a transparent wrapper module that exposes
    /// its inner child at path `""` must NOT prepend a leading `.` to
    /// the child's own descendant paths.
    ///
    /// The DeepLabV3 model uses this idiom — `DeepLabV3::named_children`
    /// returns `("", &backbone)` and `ResNet50Dilated::named_children`
    /// returns `("backbone", &inner)`. Pre-#1142 the walker produced
    /// `".backbone"`, mismatching state-dict keys like `"backbone.bn1.X"`
    /// and silently failing every BN-buffer load on DeepLabV3's backbone.
    #[test]
    fn module_named_descendants_dyn_empty_parent_no_leading_dot() {
        /// Exposes a `ParentModule` under the empty name, so the walker
        /// has to surface the inner `("child", _)` entry as plain
        /// `"child"` rather than `".child"`.
        struct TransparentWrapper<T: Float> {
            inner: ParentModule<T>,
        }

        // Everything except the child listing is pure delegation.
        impl<T: Float> Module<T> for TransparentWrapper<T> {
            fn forward(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
                self.inner.forward(input)
            }

            fn parameters(&self) -> Vec<&Parameter<T>> {
                self.inner.parameters()
            }

            fn parameters_mut(&mut self) -> Vec<&mut Parameter<T>> {
                self.inner.parameters_mut()
            }

            fn named_parameters(&self) -> Vec<(String, &Parameter<T>)> {
                self.inner.named_parameters()
            }

            fn children(&self) -> Vec<&dyn Module<T>> {
                let wrapped: &dyn Module<T> = &self.inner;
                vec![wrapped]
            }

            fn named_children(&self) -> Vec<(String, &dyn Module<T>)> {
                let wrapped: &dyn Module<T> = &self.inner;
                vec![(String::new(), wrapped)]
            }

            fn train(&mut self) {
                self.inner.train();
            }

            fn eval(&mut self) {
                self.inner.eval();
            }

            fn is_training(&self) -> bool {
                self.inner.is_training()
            }
        }

        let wrapper = TransparentWrapper::<f32> {
            inner: ParentModule::new().unwrap(),
        };
        let paths: Vec<String> = wrapper
            .named_descendants_dyn()
            .into_iter()
            .map(|(path, _)| path)
            .collect();

        // 2 entries: ("" -> inner) and ("child" -> grandchild).
        let expected = vec![String::new(), "child".to_string()];
        assert_eq!(paths, expected);

        for p in paths.iter() {
            assert!(
                !p.starts_with('.'),
                "transparent-wrapper descendant path '{p}' starts with '.'; \
                 the empty-parent branch in named_descendants_dyn has regressed",
            );
        }
    }

    // -------------------------------------------------------------------
    // Hook-registration trait methods (#606)
    // -------------------------------------------------------------------

    /// A forward hook installed via `with_forward_hook` runs once per
    /// `forward` call on the wrapper.
    #[test]
    fn with_forward_hook_wraps_and_fires() {
        use std::sync::Arc;
        use std::sync::atomic::{AtomicUsize, Ordering};

        let module = SimpleModule::<f32>::new(2).unwrap();
        let calls = Arc::new(AtomicUsize::new(0));
        let calls_in_hook = Arc::clone(&calls);

        let (hooked, _handle) = module.with_forward_hook(Box::new(move |_input, _output| {
            calls_in_hook.fetch_add(1, Ordering::SeqCst);
        }));

        let storage = ferrotorch_core::TensorStorage::cpu(vec![1.0_f32, 2.0]);
        let input = ferrotorch_core::Tensor::from_storage(storage, vec![2], false).unwrap();
        let _ = hooked.forward(&input).unwrap();

        assert_eq!(calls.load(Ordering::SeqCst), 1);
    }

    /// A forward pre-hook installed via `with_forward_pre_hook` runs once
    /// per `forward` call on the wrapper.
    #[test]
    fn with_forward_pre_hook_wraps_and_fires() {
        use std::sync::Arc;
        use std::sync::atomic::{AtomicUsize, Ordering};

        let module = SimpleModule::<f32>::new(2).unwrap();
        let calls = Arc::new(AtomicUsize::new(0));
        let calls_in_hook = Arc::clone(&calls);

        let (hooked, _handle) = module.with_forward_pre_hook(Box::new(move |input| {
            calls_in_hook.fetch_add(1, Ordering::SeqCst);
            Ok(input.clone())
        }));

        let storage = ferrotorch_core::TensorStorage::cpu(vec![1.0_f32, 2.0]);
        let input = ferrotorch_core::Tensor::from_storage(storage, vec![2], false).unwrap();
        let _ = hooked.forward(&input).unwrap();

        assert_eq!(calls.load(Ordering::SeqCst), 1);
    }

    /// `with_backward_hook` hands back a working wrapper and a removable
    /// handle. The hook itself is not expected to run here, since no
    /// backward pass is performed.
    #[test]
    fn with_backward_hook_returns_handle() {
        let module = SimpleModule::<f32>::new(2).unwrap();
        let (hooked, registration) = module.with_backward_hook(Box::new(|_gi, _go| {}));

        // The wrapper is itself a `Module<T>`: run a forward pass through
        // it to confirm it round-trips.
        let storage = ferrotorch_core::TensorStorage::cpu(vec![3.0_f32]);
        let input = ferrotorch_core::Tensor::from_storage(storage, vec![1], false).unwrap();
        let _ = hooked.forward(&input).unwrap();

        // Dropping the handle would also be fine; remove it explicitly.
        registration.remove();
    }
}