ferrotorch_nn/
lib.rs

1//! # `ferrotorch-nn` — crate root
2//!
3//! Declares every per-module file, re-exports the canonical public surface
4//! (layer types, `Module` trait, `Parameter`, `Buffer`, container types,
5//! gradient-clipping helpers, the `Module` derive macro), and provides the
6//! `prelude` module that mirrors `from torch import nn` ergonomics.
7//!
8//! ## REQ status (per `.design/ferrotorch-nn/lib.md`)
9//!
10//! | REQ | Status | Evidence |
11//! |---|---|---|
12//! | REQ-1 | SHIPPED | Crate-wide `#![warn(clippy::all, clippy::pedantic)]` + `#![deny(rust_2018_idioms)]` baseline at the top of `lib.rs`; `cargo clippy -p ferrotorch-nn --lib -- -D warnings` enforces on every build. |
13//! | REQ-2 | SHIPPED | 31 `pub mod` declarations cover every per-layer file; `cargo check -p ferrotorch-nn` fails if any module file is missing. |
14//! | REQ-3 | SHIPPED | Flat `pub use` re-exports surface every layer + utility name at crate root, mirroring `torch/nn/__init__.py:11-50`; consumed by `ferrotorch-optim/src/optimizer.rs` (line 5) `use ferrotorch_nn::Parameter` and every model crate. |
15//! | REQ-4 | SHIPPED | `pub use ferrotorch_nn_derive::Module` republishes the derive macro under the trait's name (separate namespaces); consumed by every `#[derive(Module)]` site in downstream layer code. |
16//! | REQ-5 | SHIPPED | `pub mod prelude` collects core abstractions + standard layers + canonical losses + gradient-clipping helpers; consumed by downstream training scripts writing `use ferrotorch_nn::prelude::*`. |
17//! | REQ-6 | SHIPPED | `#[allow(unused_extern_crates)] extern crate self as ferrotorch_nn;` enables the derive macro's `::ferrotorch_nn::Module` hygienic path; consumed implicitly by every `#[derive(Module)]` macro expansion inside this crate. |
18// Lint baseline mirrors the workspace-standard pattern from
19// `ferrotorch-core`/`-distributed`/`-jit`/`-cubecl`/`-xpu` lib.rs.
20#![warn(clippy::all, clippy::pedantic)]
21#![deny(rust_2018_idioms)]
22// `missing_docs` and `missing_debug_implementations` are held at `allow`
23// while the workspace-wide rustdoc / `Debug` pass is tracked separately
24// (matches the existing `ferrotorch-core`/`-gpu`/`-distributed` precedent —
25// diverging unilaterally from a leaf crate would be Step 4 architectural
26// unilateralism). Several modules expose `Box<dyn Module<T>>` trait
27// objects whose `Debug` impls require careful hand-rolling.
28#![allow(missing_docs, missing_debug_implementations)]
29// Pedantic lints we explicitly accept across this crate. Each allow names
30// a concrete reason — the alternative would be churn-for-zero-benefit or
31// a worse API. Mirrors the ferrotorch-core / ferrotorch-distributed
32// baseline; add to this list only with a one-line justification.
33#![allow(
34    // The crate is laid out so submodule names (`module::Module`,
35    // `parameter::Parameter`, `loss::MSELoss`) match the public type they
36    // export; renaming would force ergonomic breakage.
37    clippy::module_name_repetitions,
38    // # Errors / # Panics sections are added as part of focused passes
39    // (this audit's Finding #5 covers the high-leverage NotImplementedOnCuda
40    // sites in loss.rs); a blanket sweep is tracked separately.
41    clippy::missing_errors_doc,
42    clippy::missing_panics_doc,
43    // NN code casts pervasively between `usize` (shape, indices) and
44    // floating-point (norms, scales) and between `f32`/`f64` (mixed
45    // precision). The explicit cast is more readable than a `cast()` call
46    // through num-traits in arithmetic-heavy kernels.
47    clippy::cast_possible_truncation,
48    clippy::cast_possible_wrap,
49    clippy::cast_sign_loss,
50    clippy::cast_precision_loss,
51    clippy::cast_lossless,
52    // `#[must_use]` on every getter is churn for marginal value; callers
53    // in this codebase already use the returned values.
54    clippy::must_use_candidate,
55    // Builder-style methods returning `Self` document their pattern in
56    // the type signature; `#[must_use]` is noise.
57    clippy::return_self_not_must_use,
58    // Doc comments follow the standard rustdoc layout; pedantic
59    // doc-markdown rules are too aggressive for the technical prose
60    // (PyTorch op names, math notation).
61    clippy::doc_markdown,
62    // Test/helper modules define small fns after `let`-bindings; the
63    // hoisting requirement is style-only.
64    clippy::items_after_statements,
65    // Long `match`-on-reduction / op blocks mirror PyTorch's taxonomy 1:1;
66    // splitting reduces legibility.
67    clippy::too_many_lines,
68    // Most NN ops take large structs / tensors by reference already; the
69    // pedantic threshold flags `Reduction` (Copy, single byte) when passed
70    // by value.
71    clippy::needless_pass_by_value,
72    // `if let Some(x) = y { ... } else { ... }` is the idiomatic Rust
73    // shape; the lint's preferred ladders are less readable.
74    clippy::option_if_let_else,
75    // Trivial getters return references to small `Copy` fields; the
76    // suggested change to direct field exposure would conflict with the
77    // pub-field-via-non_exhaustive policy used by loss configuration types.
78    clippy::trivially_copy_pass_by_ref,
79    // Long literals appear in test reference values (PyTorch parity
80    // fixtures); separating with underscores reduces fidelity to the
81    // original numeric source.
82    clippy::unreadable_literal,
83    // `match` on small enums (Reduction, GeluApproximate, …) is more
84    // legible than chained `if let`.
85    clippy::single_match_else,
86    // Pred / target / grad / loss share short identifiers in math-heavy
87    // kernels; `similar_names` flags them but renaming hurts readability
88    // for readers familiar with the PyTorch reference.
89    clippy::similar_names,
90    // Math kernels naturally use single-character names (m, k, n for matmul
91    // dims; i, j for indices); requiring longer names hurts readability.
92    clippy::many_single_char_names,
93    // `let ... else { return }` rewrites of `match { Some(x) => x, None => ... }`
94    // are often less readable when the match arm is the natural pattern.
95    clippy::manual_let_else,
96    // GPU host-side kernels often have many `&[T]` parameters mirroring a
97    // kernel's input signature; refactoring each into a struct adds churn
98    // without benefit. Per-site comments justify retained allows.
99    clippy::too_many_arguments,
100    // `.collect::<Vec<_>>()` after mapping is the idiomatic shape; rewriting
101    // to extend(map(..)) is lossier and clippy's preference is contested.
102    clippy::redundant_closure_for_method_calls,
103    // Tensor ops naturally use `for i in 0..n { ... }` over `for x in arr.iter()`
104    // when the index itself is needed (multi-dim addressing); the pedantic
105    // preference for `iter()` is contested in this codebase.
106    clippy::needless_range_loop,
107    // Many NN kernels compare floats exactly to zero / one as a fast-path
108    // guard (e.g., `if scale == 0.0 { return zeros }`). The math is exact
109    // for these literal cases; bit-level comparison is intentional.
110    clippy::float_cmp,
111    // Format-string inlining (`format!("{x}")` over `format!("{}", x)`) is
112    // a churn-only sweep; the workspace tracks this in a separate cleanup
113    // pass per the existing precedent in ferrotorch-core / -distributed.
114    clippy::uninlined_format_args,
115    // `.to_vec()` via Deref is the natural shape for tensor slice → owned;
116    // the lint's preferred form (`.iter().copied().collect()`) is uglier.
117    clippy::implicit_clone,
118    // `(a + b) / 2` idiom appears in interpolation; the `midpoint`
119    // intrinsic that clippy suggests has subtle precision differences.
120    clippy::manual_midpoint,
121    // `match` arms wrapping `(a, b)` against `Some` / `None` with explicit
122    // arms are clearer than the `option_map_unit_fn` rewrite.
123    clippy::option_map_unit_fn,
124    // `map(...).unwrap_or(...)` is a perfectly clear idiom; the suggested
125    // `map_or` rewrite is one shorter token but loses the structural shape.
126    clippy::map_unwrap_or,
127    // `as` casts between raw pointers (e.g. `*const T as *const u8`) are
128    // load-bearing in GPU buffer reinterpretation paths; the suggested
129    // `.cast::<U>()` shape obscures the byte-reinterpretation intent.
130    clippy::ptr_as_ptr,
131    // Bind-to-_ patterns appear in tests where the compiler would otherwise
132    // complain about an unused mutability or move; the `_` is intentional.
133    clippy::no_effect_underscore_binding,
134    // `IntoIterator for &Foo` is a legitimate API decision but `&Foo`
135    // here is the right type even without explicit `iter()` blanket impls.
136    clippy::iter_without_into_iter,
137    // Manual `Debug` impls in this crate intentionally elide internal
138    // fields (e.g., heavy tensors) for log readability; clippy wants every
139    // field included.
140    clippy::missing_fields_in_debug,
141    // Shape arrays are naturally `[usize; N]`; the lint's preferred
142    // `Box<[usize]>` would force allocation in a hot path.
143    clippy::single_match,
144    // `if let Some(x) = ... { ... } else { ... }` is the idiomatic Rust
145    // shape for biased Option matching; clippy's `else` rewrites are noisier.
146    clippy::if_not_else,
147    // `unsafe_derive_deserialize` doesn't apply (we don't serde unsafe types
148    // through this crate), but the lint mis-fires on `#[derive(Debug)]` of
149    // structs containing `Box<dyn Module>`. Held until a workspace-wide
150    // serde audit lands.
151    clippy::unsafe_derive_deserialize,
152    // `for x in arr.iter() { ... }` is the explicit form some readers
153    // prefer; the rewrite to `for x in arr` is a style-only refactor.
154    clippy::explicit_iter_loop,
155    // `if cond { return ...; } else { ... }` with the else block holding
156    // the natural fall-through is a deliberate style choice when the
157    // early-return path is logically the exception; clippy wants to
158    // collapse it.
159    clippy::redundant_else,
160)]
161
162// Allow the proc macro's generated code (`::ferrotorch_nn::Module`, etc.)
163// to resolve when used from *within* this crate (e.g., integration tests
164// compiled as part of ferrotorch-nn itself). This `extern crate` is
165// load-bearing for the derive macro's hygienic path even though it appears
166// unused to the compiler — `rust_2018_idioms` flags it without seeing the
167// macro expansion.
168#[allow(unused_extern_crates)]
169extern crate self as ferrotorch_nn;
170
171pub mod activation;
172pub mod attention;
173pub mod buffer;
174pub mod container;
175pub mod conv;
176pub mod dropout;
177pub mod embedding;
178pub mod flash_attention;
179pub mod flex_attention;
180pub mod functional;
181pub mod hooks;
182pub mod identity;
183pub mod init;
184pub mod lazy_conv;
185pub mod lazy_conv_transpose;
186pub mod lazy_linear;
187pub mod lazy_norm;
188pub mod linear;
189pub mod lora;
190pub mod loss;
191pub mod module;
192pub mod norm;
193pub mod padding;
194pub mod paged_attention;
195pub mod parameter;
196pub mod parameter_container;
197pub mod pooling;
198pub mod qat;
199pub mod rnn;
200pub mod rnn_utils;
201pub mod se;
202pub mod transformer;
203pub mod upsample;
204pub mod utils;
205
206pub use activation::{
207    CELU, ELU, GELU, GLU, GeluApproximate, HardSigmoid, HardSwish, Hardshrink, Hardtanh, LeakyReLU,
208    LogSigmoid, LogSoftmax, Mish, PReLU, RReLU, ReLU, ReLU6, SELU, SiLU, Sigmoid, Softmax,
209    Softmax2d, Softmin, Softplus, Softshrink, Softsign, Tanh, Tanhshrink, Threshold,
210};
211pub use attention::{MultiheadAttention, repeat_kv, reshape_to_heads, transpose_heads_to_2d};
212pub use container::{ModuleDict, ModuleList, Sequential};
213pub use conv::{
214    Conv1d, Conv2d, Conv3d, ConvTranspose1d, ConvTranspose2d, ConvTranspose3d, StringPadding,
215};
216pub use dropout::{AlphaDropout, Dropout, Dropout1d, Dropout2d, Dropout3d, FeatureAlphaDropout};
217pub use embedding::{Embedding, EmbeddingBag, EmbeddingBagMode};
218pub use flash_attention::{flash_attention, standard_attention};
219pub use flex_attention::{
220    BlockMask, alibi_score_mod, causal_score_mod, flex_attention, relative_position_bias_score_mod,
221};
222pub use hooks::{BackwardHook, ForwardHook, ForwardPreHook, HookHandle, HookedModule};
223pub use identity::{
224    ChannelShuffle, CosineSimilarity, Flatten, Identity, PairwiseDistance, Unflatten,
225};
226pub use init::{FanMode, NonLinearity};
227pub use lazy_conv::{LazyConv1d, LazyConv2d, LazyConv3d};
228pub use lazy_linear::LazyLinear;
229pub use linear::{Bilinear, Linear};
230pub use lora::LoRALinear;
231pub use loss::{
232    BCELoss, BCEWithLogitsLoss, CTCLoss, CosineEmbeddingLoss, CrossEntropyLoss, GaussianNLLLoss,
233    HingeEmbeddingLoss, HuberLoss, KLDivLoss, L1Loss, MSELoss, MarginRankingLoss,
234    MultiLabelSoftMarginLoss, MultiMarginLoss, NLLLoss, PoissonNLLLoss, SmoothL1Loss,
235    TripletMarginLoss,
236};
237pub use module::{Module, Reduction, StateDict};
238// Re-export the derive macro. The derive macro and the trait share the name
239// `Module` but live in different namespaces (macro vs type), so both are
240// usable simultaneously: `use ferrotorch_nn::{Module, ...}` gives the trait,
241// and `#[derive(Module)]` resolves to the derive macro.
242pub use buffer::Buffer;
243pub use ferrotorch_nn_derive::Module;
244pub use norm::{
245    BatchNorm1d, BatchNorm2d, BatchNorm3d, GroupNorm, InstanceNorm1d, InstanceNorm2d,
246    InstanceNorm3d, LayerNorm, LocalResponseNorm, RMSNorm,
247};
248pub use padding::{
249    CircularPad1d, CircularPad2d, CircularPad3d, ConstantPad1d, ConstantPad2d, ConstantPad3d,
250    PaddingMode, ReflectionPad1d, ReflectionPad2d, ReflectionPad3d, ReplicationPad1d,
251    ReplicationPad2d, ReplicationPad3d, ZeroPad1d, ZeroPad2d, ZeroPad3d,
252};
253pub use paged_attention::{KVPage, PagePool, PagedAttentionManager, PagedKVCache};
254pub use parameter::Parameter;
255pub use parameter_container::{ParameterDict, ParameterList};
256pub use pooling::{
257    AdaptiveAvgPool1d, AdaptiveAvgPool2d, AdaptiveAvgPool3d, AdaptiveMaxPool1d, AdaptiveMaxPool2d,
258    AdaptiveMaxPool3d, AvgPool1d, AvgPool2d, AvgPool3d, FractionalMaxPool2d, LPPool1d, LPPool2d,
259    MaxPool1d, MaxPool2d, MaxPool3d, MaxUnpool2d, adaptive_avg_pool1d, adaptive_avg_pool2d,
260    adaptive_avg_pool3d, adaptive_max_pool1d, adaptive_max_pool2d, adaptive_max_pool3d, avg_pool1d,
261    avg_pool2d, avg_pool3d, lp_pool1d, lp_pool2d, max_pool1d, max_pool2d, max_pool3d, max_unpool2d,
262};
263pub use qat::{ObserverType, QatConfig, QatModel, QuantizedModel, prepare_qat};
264pub use rnn::{GRU, GRUCell, LSTM, LSTMCell, RNN, RNNCell, RNNNonlinearity};
265pub use rnn_utils::{PackedSequence, pack_padded_sequence, pad_packed_sequence, pad_sequence};
266pub use se::SqueezeExcitation;
267pub use transformer::{
268    KVCache, RoPEConvention, RoPEScaling, RotaryPositionEmbedding, SwiGLU, Transformer,
269    TransformerDecoder, TransformerDecoderLayer, TransformerEncoder, TransformerEncoderLayer,
270};
271pub use upsample::{
272    Fold, GridSampleMode, GridSamplePaddingMode, InterpolateMode, PixelShuffle, PixelUnshuffle,
273    Unfold, Upsample, affine_grid, fold, grid_sample, interpolate, pixel_shuffle, pixel_unshuffle,
274    unfold,
275};
276pub use utils::{clip_grad_norm_, clip_grad_value_};
277
278/// Glob-import-friendly re-exports of the most commonly used items.
279///
280/// Pulls in the core building blocks needed to write a model: the `Module`
281/// trait + derive macro, `Parameter`, `StateDict`, the standard layers
282/// (`Linear`, `Conv2d`, `LayerNorm`, `GELU`, …), the canonical losses
283/// (`MSELoss`, `CrossEntropyLoss`), and the gradient-clipping helpers.
284///
285/// Mirrors PyTorch's `from torch import nn` ergonomics.
286///
287/// ```ignore
288/// use ferrotorch_nn::prelude::*;
289/// ```
290pub mod prelude {
291    // Core abstractions: trait, parameter, state-dict, derive macro.
292    pub use crate::buffer::Buffer;
293    pub use crate::module::{Module, Reduction, StateDict};
294    pub use crate::parameter::Parameter;
295    pub use ferrotorch_nn_derive::Module as DeriveModule;
296
297    // Standard layers most models use.
298    pub use crate::activation::{GELU, ReLU, Sigmoid, Softmax, Tanh};
299    pub use crate::container::{ModuleDict, ModuleList, Sequential};
300    pub use crate::conv::{Conv1d, Conv2d, Conv3d};
301    pub use crate::dropout::Dropout;
302    pub use crate::embedding::Embedding;
303    pub use crate::linear::Linear;
304    pub use crate::norm::{BatchNorm1d, BatchNorm2d, GroupNorm, LayerNorm, RMSNorm};
305    pub use crate::pooling::{AdaptiveAvgPool2d, MaxPool2d};
306
307    // Canonical losses.
308    pub use crate::loss::{BCELoss, BCEWithLogitsLoss, CrossEntropyLoss, L1Loss, MSELoss, NLLLoss};
309
310    // Gradient clipping (utils).
311    pub use crate::utils::{clip_grad_norm_, clip_grad_value_};
312}
ferrotorch_nn/lib.rs

ferrotorch_nn/
lib.rs