Struct NestedPrefixPenalty

Source

/// Nested-prefix sparsity penalty: a smoothed-L¹ term summed over K nested
/// prefixes of the latent dimension, with shell k weighted by
/// λ_k = w_k · exp(ρ_k).
pub struct NestedPrefixPenalty {
    /// Latent target the penalty is evaluated on, laid out as
    /// n_rows × latent_dim in row-major order (row * F + col).
    pub target: PsiSlice,
    /// Tier associated with `target`.
    /// NOTE(review): presumably the value reported by `tier()` /
    /// `dispatch_tier()` — confirm against the implementation.
    pub target_tier: PenaltyTier,
    /// Sorted, strictly increasing prefix sizes m_1 < m_2 < ... < m_K.
    pub prefix_sizes: Vec<usize>,
    /// Per-shell base weights w_k; the effective strength of shell k is
    /// λ_k = w_k · exp(ρ_k).
    pub shell_weights: Vec<f64>,
    /// Smoothing parameter ε > 0 of the surrogate sqrt(x² + ε²).
    pub eps: f64,
    /// Local ρ indices for the K per-shell log-strengths.
    pub rho_indices: Vec<usize>,
    /// Optional scalar weight schedule.
    /// NOTE(review): presumably installed by `with_weight_schedule` and
    /// consumed by `apply_schedule(iter)` — confirm.
    pub weight_schedule: Option<ScalarWeightSchedule>,
}

Expand description

Nested-prefix sparsity penalty used by the Matryoshka SAE (Bussmann/Nabeshima/Karvonen/Nanda, ICML 2025, arXiv:2503.17547).

Given K nested prefix sizes m_1 < m_2 < ... < m_K ≤ F over the latent dimension F, and per-shell weights λ_k = w_k · exp(ρ_k), the penalty is

  P(t; ρ) = Σ_k λ_k · Σ_{i=0}^{m_k - 1} sqrt(t_i² + ε²)

summed over all rows of the latent target. Equivalently, coordinate i contributes with effective weight W_i = Σ_{k: m_k > i} λ_k, so the earliest atoms (small i) are penalized by every shell (= strongest L¹) and the latest atoms only by the outermost shell. This is exactly the mask-weighted sum-of-L¹ over K prefixes used to enforce shell-wise reconstruction during Matryoshka training.

Closed forms (per row, summed across all rows):

  ∂P/∂t_i      = W_i · t_i / sqrt(t_i² + ε²)
  Hess_diag(i) = W_i · ε² / (t_i² + ε²)^{3/2}           (PSD)
  ∂P/∂ρ_k      = λ_k · Σ_{i < m_k} sqrt(t_i² + ε²)

target is laid out as n_rows × latent_dim in row-major order: element (row, col) lives at index row * F + col. latent_dim is taken from PsiSlice::latent_dim; if absent we fall back to the maximum prefix size m_K, which is the standard Matryoshka convention.

Fields§

§target: PsiSlice

§target_tier: PenaltyTier

§prefix_sizes: Vec<usize>

Sorted strictly-increasing prefix sizes m_1 < m_2 < ... < m_K.

§shell_weights: Vec<f64>

Per-shell base weights w_k. The effective strength is λ_k = w_k · exp(ρ_k).

§eps: f64

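Smoothing parameter ε > 0 for the smoothed-L¹ surrogate sqrt(x² + ε²). With ε > 0 the surrogate is twice differentiable at x = 0, where its second derivative equals 1/ε; at ε = 0 it reduces to |x|, which has no Hessian at 0.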

§rho_indices: Vec<usize>

Local ρ indices for the K per-shell log-strengths.

§weight_schedule: Option<ScalarWeightSchedule>

Struct NestedPrefixPenalty

Fields§

Implementations§

impl NestedPrefixPenalty

pub fn new( target: PsiSlice, target_tier: PenaltyTier, prefix_sizes: Vec<usize>, shell_weights: Vec<f64>, eps: f64, ) -> Result<Self, String>

pub fn with_weight_schedule(self, schedule: ScalarWeightSchedule) -> Self

Trait Implementations§

impl AnalyticPenalty for NestedPrefixPenalty

fn tier(&self) -> PenaltyTier

fn value(&self, target: ArrayView1<'_, f64>, rho: ArrayView1<'_, f64>) -> f64

fn grad_target( &self, target: ArrayView1<'_, f64>, rho: ArrayView1<'_, f64>, ) -> Array1<f64>

fn hessian_diag( &self, target: ArrayView1<'_, f64>, rho: ArrayView1<'_, f64>, ) -> Option<Array1<f64>>

fn grad_rho( &self, target: ArrayView1<'_, f64>, rho: ArrayView1<'_, f64>, ) -> Array1<f64>

fn rho_count(&self) -> usize

fn name(&self) -> &str

fn apply_schedule(&mut self, iter: usize)

fn hvp( &self, target: ArrayView1<'_, f64>, rho: ArrayView1<'_, f64>, v: ArrayView1<'_, f64>, ) -> Array1<f64>

fn psd_majorizer_diag( &self, target: ArrayView1<'_, f64>, rho: ArrayView1<'_, f64>, ) -> Option<Array1<f64>>

fn psd_majorizer_hvp( &self, target: ArrayView1<'_, f64>, rho: ArrayView1<'_, f64>, v: ArrayView1<'_, f64>, ) -> Array1<f64>

impl Clone for NestedPrefixPenalty

fn clone(&self) -> NestedPrefixPenalty

fn clone_from(&mut self, source: &Self)

impl Debug for NestedPrefixPenalty

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl PenaltyManifest for NestedPrefixPenalty

const KIND_TAG: &'static str = "nested_prefix"

const PYTHON_WRAPPER: &'static str = "NestedPrefixPenalty"

const ROW_BLOCK_DIAGONAL: bool = true

fn dispatch_tier(&self) -> PenaltyTier

Auto Trait Implementations§

impl Freeze for NestedPrefixPenalty

impl RefUnwindSafe for NestedPrefixPenalty

impl Send for NestedPrefixPenalty

impl Sync for NestedPrefixPenalty

impl Unpin for NestedPrefixPenalty

impl UnsafeUnpin for NestedPrefixPenalty

impl UnwindSafe for NestedPrefixPenalty

Blanket Implementations§

impl<T> Allocation for T where T: RefUnwindSafe + Send + Sync,

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> ByRef<T> for T

fn by_ref(&self) -> &T

impl<ST, DT> CastableFrom<ST, Initialized, Initialized> for DT where ST: ?Sized, DT: ?Sized,

impl<ST, DT> CastableFrom<ST, Uninit, Uninit> for DT where ST: ?Sized, DT: ?Sized,

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> DistributionExt for T where T: ?Sized,

fn rand<T>(&self, rng: &mut (impl Rng + ?Sized)) -> T where Self: Distribution<T>,

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Imply<T> for U where T: ?Sized, U: ?Sized,

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> Read<Exclusive, BecauseExclusive> for T where T: ?Sized,

impl<T> Same for T

type Output = T

impl<SS, SP> SupersetOf<SS> for SP where SS: SubsetOf<SP>,

fn to_subset(&self) -> Option<SS>

fn is_in_subset(&self) -> bool

fn to_subset_unchecked(&self) -> SS

fn from_subset(element: &SS) -> SP

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

Struct NestedPrefixPenalty

impl<T> Allocation for T
where T: RefUnwindSafe + Send + Sync,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<ST, DT> CastableFrom<ST, Initialized, Initialized> for DT
where ST: ?Sized, DT: ?Sized,

impl<ST, DT> CastableFrom<ST, Uninit, Uninit> for DT
where ST: ?Sized, DT: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T> DistributionExt for T
where T: ?Sized,

fn rand<T>(&self, rng: &mut (impl Rng + ?Sized)) -> T
where Self: Distribution<T>,

impl<T, U> Imply<T> for U
where T: ?Sized, U: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> Read<Exclusive, BecauseExclusive> for T
where T: ?Sized,

impl<SS, SP> SupersetOf<SS> for SP
where SS: SubsetOf<SP>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,