Skip to main content

ferrotorch_whisper/
lib.rs

1// Crate-level lint baseline. Mirrors the ferrotorch-bert posture:
2// deny correctness / idiom / Debug / docs problems; warn pedantic
3// stylistic issues. Specific pedantic lints are allowed crate-wide
4// where the lint is consistently wrong for ML/numeric kernel code.
5
6#![deny(unsafe_code)]
7#![deny(rust_2018_idioms)]
8#![deny(missing_debug_implementations)]
9#![deny(missing_docs)]
10#![warn(clippy::all)]
11#![warn(clippy::pedantic)]
12// Casts: dimension math (`as usize`, `as f32`, `as u32`) is intrinsic
13// to tensor indexing — every kernel call would otherwise need a
14// per-call allow.
15#![allow(clippy::cast_possible_truncation)]
16#![allow(clippy::cast_precision_loss)]
17#![allow(clippy::cast_sign_loss)]
18#![allow(clippy::cast_possible_wrap)]
19#![allow(clippy::cast_lossless)]
20// Builder-style accessors don't all need `#[must_use]`.
21#![allow(clippy::must_use_candidate)]
22// Identifiers like `bf16`, `f32`, `LayerNorm`, `Whisper`, `STFT` are
23// flagged as missing backticks even when they appear in code-fenced
24// text.
25#![allow(clippy::doc_markdown)]
26// `needless_pass_by_value` would force `&WhisperConfig` signatures
27// throughout, hiding intent in the API.
28#![allow(clippy::needless_pass_by_value)]
29// `unnecessary_wraps` flags `Result`-returning helpers that today
30// always succeed but are part of an extensible API surface.
31#![allow(clippy::unnecessary_wraps)]
32// `uninlined_format_args` flags `format!("x={}", x)` vs
33// `format!("x={x}")`. Both are equally clear; the fixup churn is high.
34#![allow(clippy::uninlined_format_args)]
35// `many_single_char_names` flags conventional ML kernel locals
36// (`q`, `k`, `v`, `h`).
37#![allow(clippy::many_single_char_names)]
38// `similar_names` flags variable pairs that are intentionally similar
39// (e.g. `q2` / `q_h`).
40#![allow(clippy::similar_names)]
41// `module_name_repetitions`: every type starts with `Whisper`
42// (matching the HF naming) — the lint would force renames that lose
43// the upstream-1:1 mapping.
44#![allow(clippy::module_name_repetitions)]
45
46//! Whisper-family audio encoder model composition for ferrotorch.
47//!
48//! Assembles the encoder half of OpenAI's Whisper model from ferrotorch
49//! primitives:
50//!
51//! ```text
52//! WhisperEncoder
53//! ├── WhisperConvStem
54//! │   ├── Conv1d (conv1: num_mel_bins → d_model, k=3, stride=1, pad=1, bias)
55//! │   └── Conv1d (conv2: d_model → d_model,     k=3, stride=2, pad=1, bias)
56//! ├── embed_positions (sinusoidal, loaded from state-dict as a parameter)
57//! ├── WhisperEncoderLayer × N
58//! │   ├── LayerNorm  (self_attn_layer_norm)                  ← PRE-NORM
59//! │   ├── WhisperEncoderSelfAttention
60//! │   │   ├── Linear q_proj    [d_model, d_model] (bias)
61//! │   │   ├── Linear k_proj    [d_model, d_model] (NO bias)
62//! │   │   ├── Linear v_proj    [d_model, d_model] (bias)
63//! │   │   └── Linear out_proj  [d_model, d_model] (bias)
64//! │   ├── LayerNorm  (final_layer_norm)                      ← PRE-NORM
65//! │   ├── Linear fc1 [d_model, encoder_ffn_dim] (bias) + GELU
66//! │   └── Linear fc2 [encoder_ffn_dim, d_model] (bias)
67//! └── LayerNorm (layer_norm — final encoder LayerNorm)
68//! ```
69//!
70//! # Audio preprocessing
71//!
72//! [`audio::log_mel_spectrogram`] turns 16 kHz mono `f32` PCM into the
73//! `[1, 80, 3000]` log-mel tensor the encoder consumes. The 80-bin
74//! filter bank is shipped as the embedded binary asset
75//! `assets/mel_filters_80x201.bin`, byte-for-byte equal to
76//! `WhisperFeatureExtractor.mel_filters.T`, so any drift between this
77//! module and the reference is in the STFT / log / clip / normalize
78//! pipeline — never in the mel scale.
79//!
80//! # Loading real weights
81//!
82//! [`WhisperEncoder::load_hf_state_dict`] accepts a `StateDict` whose
83//! keys use the HuggingFace `WhisperModel` naming convention. It
84//! filters out non-encoder keys (decoder / `proj_out` / etc.) and
85//! returns an [`encoder::DropReport`] documenting every drop so the pin
86//! script can confirm no encoder key was silently lost. Combined with
87//! `ferrotorch_serialize::load_safetensors` and the
88//! [`load_whisper_encoder`] helper this gives a direct path from a
89//! downloaded `openai/whisper-tiny` checkpoint to an encoder ready to
90//! produce `[1, 1500, 384]` hidden states.
91//!
92//! # Out of scope
93//!
94//! The decoder (cross-attention, kv-cache, beam search) is intentionally
95//! not implemented in this crate. Phase B.2 of real-artifact-driven
96//! development is encoder-only.
97//!
98//! ## REQ status (per `.design/<area>/<file>.md`)
99//!
100//! | REQ | Status | Evidence |
101//! | --- | --- | --- |
102//! | REQ-1 | SHIPPED | impl: `#![deny(...)]` / `#![allow(...)]` block at `lib.rs:5-44`; non-test consumer: enforced by every other file in the crate. |
103//! | REQ-2 | SHIPPED | impl: `pub mod` declarations in `lib.rs`; non-test consumer: every other `.rs` file in the crate uses `crate::<mod>::...` paths. |
104//! | REQ-3 | SHIPPED | impl: `pub use` block at `lib.rs:114-119`; non-test consumer: downstream binaries import these names directly. |
105//! | REQ-4 | SHIPPED | impl: `//!` doc-comment block at `lib.rs:46-96`; non-test consumer: published via `cargo doc -p ferrotorch-whisper`. |
106
107pub mod attention;
108pub mod audio;
109pub mod config;
110pub mod encoder;
111pub mod layer;
112pub mod safetensors_loader;
113
114pub use attention::WhisperEncoderSelfAttention;
115pub use audio::{N_FRAMES, N_MELS, SAMPLE_RATE, log_mel_spectrogram};
116pub use config::{HfWhisperConfig, WhisperConfig};
117pub use encoder::{DropReport, WhisperConvStem, WhisperEncoder};
118pub use layer::WhisperEncoderLayer;
119pub use safetensors_loader::load_whisper_encoder;