trueno/tiling/mod.rs
1//! Tiling Compute Blocks (TCB) - Work Partitioning for High-Performance Kernels
2//!
3//! TCBs represent the fundamental unit of work partitioning within `ComputeBrick` kernels.
4//! While a `ComputeBrick` defines a logical operation (e.g., Q4_K MatMul), a TCB defines
5//! the physical execution strategy—how data is partitioned across the memory hierarchy.
6//!
7//! # Architecture
8//!
9//! Tiling occurs at three levels:
10//! 1. **Macro-Tile (L3/Global Memory)**: Partitioning across CPU sockets or GPU SMs
11//! 2. **Midi-Tile (L2/Shared Memory)**: Partitioning within a thread block or Rayon task
12//! 3. **Micro-Tile (Registers)**: Smallest unit, processed by SIMD instructions or CUDA warps
13//!
14//! # Modules
15//!
16//! - `geometry` - [`TcbGeometry`] dimensions and level definitions ([`TcbLevel`])
17//! - `config` - [`TilingConfig`] and backend selection ([`TilingBackend`])
18//! - `calculator` - [`TcbIndexCalculator`] for index computation
19//! - `packing` - Memory layout packing utilities
20//! - `prefetch` - Prefetch locality hints
21//! - `q4k_matvec` - Q4_K quantized matrix-vector tiling
22//! - `error` - [`TilingError`] types
23
// Submodules are private; the items callers need are re-exported below.
24mod calculator; // index computation (`TcbIndexCalculator`)
25mod config; // `TilingConfig` and `TilingBackend` selection
26mod error; // `TilingError`
27mod geometry; // `TcbGeometry` dimensions and `TcbLevel`
28mod packing; // memory layout packing utilities (`PackingLayout`, index helpers)
29mod prefetch; // prefetch locality hints (`PrefetchLocality`)
30mod q4k_matvec; // Q4_K quantized matrix-vector tiling (`TiledQ4KMatvec`)
31
// Public surface of the tiling module: items from the private submodules are
// re-exported here so callers can name them directly from this module instead
// of through the submodule paths.
32pub use calculator::TcbIndexCalculator;
33pub use config::{TilingBackend, TilingConfig};
34pub use error::TilingError;
35pub use geometry::{TcbGeometry, TcbLevel};
36pub use packing::{pack_a_index, pack_b_index, swizzle_index, PackingLayout};
37pub use prefetch::{optimal_prefetch_distance, PrefetchLocality};
38pub use q4k_matvec::{
39 extract_scale_min_6bit, f16_to_f32, TiledQ4KMatvec, TilingStats, Q4K_SUPERBLOCK_BYTES,
40 Q4K_SUPERBLOCK_SIZE,
41};
42
// Test module lives in its own file and is compiled only when `cfg(test)` is set.
43#[cfg(test)]
44mod tests;