// anamnesis 0.5.0
//
// Parse any tensor format, recover any precision — framework-agnostic FP8/GPTQ/AWQ/BnB dequantization, NPZ parsing, and PyTorch .pth conversion for Rust
// Documentation
// SPDX-License-Identifier: MIT OR Apache-2.0

//! **ἀνάμνησις** — parse any tensor format, recover any precision.
//!
//! `anamnesis` is a framework-agnostic Rust library for dequantizing
//! quantized model weights and parsing tensor archives. It handles
//! `.safetensors` (memory-mapped, classify, dequantize to `BF16`),
//! `.npz` (bulk extraction at near-I/O speed), and `PyTorch` `.pth`
//! (zero-copy mmap with lossless safetensors conversion, 11–31× faster
//! than `torch.load()`).
//!
//! # Supported Quantization Schemes (decode — [`remember`])
//!
//! | Scheme | Feature gate | Speedup vs `PyTorch` CPU (AVX2) |
//! |--------|-------------|-------------------------------|
//! | `FP8` `E4M3` (fine-grained, per-channel, per-tensor) | *(always on)* | 2.7–9.7× |
//! | `GPTQ` (`INT4`/`INT8`, group-wise, `g_idx`) | `gptq` | 6.5–12.2× |
//! | `AWQ` (`INT4`, per-group, activation-aware) | `awq` | 4.7–5.7× |
//! | `BitsAndBytes` `NF4`/`FP4` (lookup + per-block absmax) | `bnb` | 18–54× |
//! | `BitsAndBytes` `INT8` (`LLM.int8()`, per-row absmax) | `bnb` | 1.2× |
//!
//! All schemes produce **bit-exact** output (0 ULP difference) against
//! `PyTorch` reference implementations, verified on real models.
//!
//! # Supported Quantization Schemes (encode — [`lethe`])
//!
//! Phase 5 introduces the encode side as the architectural inverse of
//! [`remember`]. Each kernel here takes the `BF16` bytes that
//! [`remember`] produces and writes the corresponding quantised bytes
//! plus the per-block / per-row metadata (`absmax`, `SCB`, …), so
//! `encode(decode(q, scale)) == q` holds bit-exactly for every codebook-
//! `LUT` family (`NF4`, `FP4`) and within `i8` representation error
//! plus the documented clamp at `± 127` for `INT8`.
//!
//! | Scheme | Feature gate | Cross-validation contract |
//! |--------|-------------|---------------------------|
//! | `BitsAndBytes` `NF4`/`FP4` encode | `bnb` | 0-ULP bit-exact round trip on every fixture |
//! | `BitsAndBytes` `INT8` encode | `bnb` | 0-ULP bit-exact round trip on every fixture (general contract: within `i8` representation error, clamped at `± 127`) |
//!
//! Subsequent encode-kernel families (`FP8`, `GGUF` legacy / `K-quants`
//! / `IQ` / `TQ` / `MXFP4`) land in Phase 7.5 and reuse the
//! `lethe::round_trip` harness introduced here.
//!
//! # `NPZ`/`NPY` Parsing
//!
//! Feature-gated behind `npz`. Custom `NPY` header parser with bulk
//! `read_exact` — zero per-element deserialization for LE data on LE
//! machines. Supports `F16`, `BF16`, `F32`, `F64`, all integer types,
//! and `Bool`. **3,586 MB/s** on a 302 MB file (1.3× raw I/O overhead).
//!
//! # `PyTorch` `.pth` Parsing
//!
//! Feature-gated behind `pth`. Minimal pickle VM (~36 opcodes) with
//! security allowlist. Memory-mapped I/O with zero-copy `Cow::Borrowed`
//! tensor data. Lossless `.pth` → `.safetensors` conversion.
//! **11–31× faster** than `torch.load()` on torchvision models.
//!
//! # Quick Start
//!
//! Path-based dequantisation (`FP8` → `BF16`):
//!
//! ```rust,no_run
//! use anamnesis::{parse, TargetDtype};
//!
//! let model = parse("model-fp8.safetensors")?;
//! let info = model.inspect();
//! println!("{info}");
//!
//! model.remember("model-bf16.safetensors", TargetDtype::BF16)?;
//! # Ok::<(), anamnesis::AnamnesisError>(())
//! ```
//!
//! Reader-generic inspection over any `Read + Seek` substrate (in-memory
//! `Cursor`, `HTTP`-range-backed adapter, custom transport). The example
//! below uses a `std::fs::File`; an `HTTP`-range adapter from a
//! downstream crate (e.g. `hf-fm`'s `HttpRangeReader`) plugs in
//! identically — `anamnesis` itself stays `HTTP`-free. Four reader-generic
//! entry points cover the supported tensor formats:
//!
//! ```rust,no_run
//! # #[cfg(all(feature = "npz", feature = "pth", feature = "gguf"))]
//! # fn run() -> anamnesis::Result<()> {
//! use anamnesis::{
//!     inspect_gguf_from_reader, inspect_npz_from_reader,
//!     inspect_pth_from_reader, parse_safetensors_header_from_reader,
//! };
//!
//! let st_header = parse_safetensors_header_from_reader(
//!     std::fs::File::open("shard.safetensors")?,
//! )?;
//! let npz_info = inspect_npz_from_reader(std::fs::File::open("weights.npz")?)?;
//! let gguf_info = inspect_gguf_from_reader(std::fs::File::open("model.gguf")?)?;
//! let pth_info = inspect_pth_from_reader(std::fs::File::open("model.pth")?)?;
//! # let _ = (st_header, npz_info, gguf_info, pth_info);
//! # Ok(()) }
//! ```
//!
//! # Architecture
//!
//! - [`parse()`] — memory-map a `.safetensors` file into a
//!   [`ParsedModel`]. Inspect-only workflows touch only the header
//!   (~1 MiB) regardless of file size; full dequantisation pages
//!   tensor bytes in lazily.
//! - [`ParsedModel::inspect`] — derive format, tensor counts, and size
//!   estimates from the parsed header (zero further I/O)
//! - [`ParsedModel::remember`] — dequantize all quantized tensors to `BF16`
//!   and write a standard `.safetensors` file
//! - [`parse_safetensors_header`] / [`parse_safetensors_header_from_reader`]
//!   — header-only safetensors parsing. The reader-generic variant accepts
//!   any `Read` substrate (in-memory `Cursor`, `HTTP`-range-backed adapter,
//!   …) and reads only the 8-byte length prefix plus the `JSON` header,
//!   so a multi-GB shard's metadata can be inspected with a single
//!   ~1 MiB sequential fetch.
//! - `parse_npz()` — read an `.npz` archive into a `HashMap<String, NpzTensor>`
//!   (requires `npz` feature)
//! - `inspect_npz()` / `inspect_npz_from_reader()` — header-only `NPZ`
//!   inspection. The reader-generic variant accepts any `Read + Seek`
//!   substrate (in-memory `Cursor`, `HTTP`-range-backed adapter, …) so callers
//!   can extract tensor metadata without materialising the data segment
//!   (requires `npz` feature)
//! - `parse_gguf()` / `inspect_gguf_from_reader()` — `GGUF` parsing /
//!   inspection. The path-based variant memory-maps the file and returns a
//!   `ParsedGguf` with zero-copy tensor views; the reader-generic variant
//!   accepts any `Read + Seek` substrate and returns just the
//!   `GgufInspectInfo` summary, so a multi-GB quantised `GGUF`'s metadata
//!   can be inspected in a few range fetches over the front-loaded header
//!   without downloading the data section (requires `gguf` feature)
//! - `parse_pth()` / `inspect_pth_from_reader()` — `PyTorch` `.pth` parsing
//!   / inspection. The path-based variant memory-maps the file and returns
//!   a `ParsedPth` with zero-copy `tensors()`; the reader-generic variant
//!   accepts any `Read + Seek` substrate and returns just the
//!   `PthInspectInfo` summary, so a torchvision-class `.pth` is inspectable
//!   in a single `<100 KiB` range fetch over the ZIP central directory and
//!   `data.pkl` entry — no tensor-data files inside the archive are read
//!   (requires `pth` feature)
//! - `pth_to_safetensors()` — lossless `.pth` → `.safetensors` conversion
//!   (requires `pth` feature)
//!
//! The [`remember`] module contains one submodule per quantization family
//! ([`remember::fp8`] always-on; `remember::gptq`, `remember::awq`,
//! `remember::bnb` feature-gated independently under `gptq` / `awq` /
//! `bnb`).
//!
//! The [`lethe`] module mirrors that layout on the encode side. Phase 5
//! ships `lethe::bnb` (feature-gated behind `bnb`) plus the
//! always-on `lethe::round_trip` validation harness. Encoding a fresh
//! `BF16` source into `BnB-NF4`:
//!
//! ```rust,no_run
//! # #[cfg(feature = "bnb")]
//! # fn run() -> anamnesis::Result<()> {
//! use anamnesis::{encode_bnb4_compute_absmax, NF4_CODEBOOK};
//!
//! // 64 BF16 elements arranged as one 64-element block.
//! let bf16_bytes: Vec<u8> = vec![0u8; 64 * 2];
//! let codebook_bytes: Vec<u8> =
//!     NF4_CODEBOOK.iter().flat_map(|v| v.to_le_bytes()).collect();
//! let (weight, absmax) =
//!     encode_bnb4_compute_absmax(&bf16_bytes, &codebook_bytes, 64, 64)?;
//! assert_eq!(weight.len(), 32);   // 64 elements / 2 nibbles per byte
//! assert_eq!(absmax.len(), 4);    // 1 block × F32 LE
//! # Ok(()) }
//! ```

// `deny` (not `forbid`) allows feature-gated modules to opt in to unsafe
// where required by external APIs (e.g., memmap2 in the `pth` module).
// See CONVENTIONS.md "// SAFETY:" rules for the policy.
#![deny(unsafe_code)]
// Treat every compiler warning as a hard error for this crate. The
// `unknown_lints` exemption below is the deliberate carve-out visible in
// this file.
#![deny(warnings)]
// Allow unknown lint names so that `#[allow(clippy::newer_lint)]` in test
// modules does not become an error when built with MSRV clippy (which may
// not recognise lints added in later releases). Without this, every new
// clippy lint suppression is a potential MSRV CI break.
#![allow(unknown_lints)]

/// Command-line interface implementation shared by the `anamnesis` and
/// `amn` binaries. Feature-gated behind `cli`; pulls in [`clap`] only
/// when enabled.
#[cfg(feature = "cli")]
pub mod cli;
// Crate error type and `Result` alias (`AnamnesisError`, `Result` are
// re-exported at the crate root).
pub mod error;
// Inspection summary types and helpers (`InspectInfo`, `format_bytes`).
pub mod inspect;
// Encode side: quantisation kernels that mirror `remember`. The `bnb`
// encoders are re-exported at the crate root behind the `bnb` feature.
pub mod lethe;
// Path-based safetensors entry point and model handle (`parse`,
// `ParsedModel`, `TargetDtype`).
pub mod model;
// Format parsers and inspectors: the safetensors header types are always
// re-exported; the `GGUF`, `NPZ` and `.pth` items are re-exported behind
// the `gguf`, `npz` and `pth` features respectively.
pub mod parse;
// Decode side: dequantisation kernels (`FP8` always re-exported; `GPTQ`,
// `AWQ`, `BnB`, `GGUF` and the `.pth` conversion helpers feature-gated).
pub mod remember;

pub use error::{AnamnesisError, Result};
pub use inspect::{format_bytes, InspectInfo};
#[cfg(feature = "bnb")]
pub use lethe::{
    encode_bnb4, encode_bnb4_compute_absmax, encode_bnb4_double_quant, encode_bnb_int8,
    encode_bnb_int8_compute_scb, FP4_CODEBOOK, NF4_CODEBOOK,
};
pub use model::{parse, ParsedModel, TargetDtype};
#[cfg(feature = "gguf")]
pub use parse::{
    inspect_gguf_from_reader, parse_gguf, GgufInspectInfo, GgufMetadataArray, GgufMetadataValue,
    GgufTensor, GgufTensorInfo, GgufType, ParsedGguf,
};
#[cfg(feature = "npz")]
pub use parse::{
    inspect_npz, inspect_npz_from_reader, parse_npz, NpzDtype, NpzInspectInfo, NpzTensor,
    NpzTensorInfo,
};
#[cfg(feature = "pth")]
pub use parse::{
    inspect_pth_from_reader, parse_pth, ParsedPth, PthDtype, PthInspectInfo, PthTensor,
    PthTensorInfo,
};
pub use parse::{
    parse_safetensors_header, parse_safetensors_header_from_reader, AwqCompanions, AwqConfig,
    Bnb4Companions, BnbConfig, Dtype, GptqCompanions, GptqConfig, QuantScheme, SafetensorsHeader,
    TensorEntry, TensorRole,
};
#[cfg(feature = "awq")]
pub use remember::dequantize_awq_to_bf16;
#[cfg(feature = "gptq")]
pub use remember::dequantize_gptq_to_bf16;
#[cfg(feature = "bnb")]
pub use remember::{dequantize_bnb4_to_bf16, dequantize_bnb_int8_to_bf16};
pub use remember::{
    dequantize_fp8_to_bf16, dequantize_per_channel_fp8_to_bf16, dequantize_per_tensor_fp8_to_bf16,
};
#[cfg(feature = "gguf")]
pub use remember::{dequantize_gguf_blocks_to_bf16, dequantize_gguf_to_bf16};
#[cfg(feature = "pth")]
pub use remember::{pth_to_safetensors, pth_to_safetensors_bytes};