pub struct ExpertParallelismConfig {
pub num_experts: usize,
pub num_experts_per_token: usize,
pub capacity_factor: f32,
pub load_balance_loss_coeff: f32,
pub router_z_loss_coeff: f32,
pub expert_dropout: f32,
pub enable_load_balancing: bool,
pub sharding_strategy: ExpertShardingStrategy,
pub max_expert_batch_size: Option<usize>,
pub enable_gradient_accumulation: bool,
pub gradient_accumulation_steps: usize,
pub initialization_strategy: ExpertInitStrategy,
pub enable_expert_sync: bool,
pub sync_frequency: usize,
pub gate_network: Option<GateNetworkConfig>,
pub load_balancing: Option<LoadBalancingConfig>,
pub migration: Option<ExpertMigrationConfig>,
pub enable_expert_migration: bool,
pub migration_threshold: f32,
pub memory_per_expert_mb: usize,
pub communication_overlap: bool,
pub gradient_compression: bool,
}
Expert parallelism configuration
This structure contains all the configuration parameters needed to set up and run a Mixture of Experts (MoE) model with distributed expert parallelism.
Examples
use torsh_distributed::expert_parallelism::config::{ExpertParallelismConfig, ExpertShardingStrategy};
let config = ExpertParallelismConfig {
num_experts: 16,
num_experts_per_token: 2,
capacity_factor: 1.5,
sharding_strategy: ExpertShardingStrategy::ModelParallel,
..Default::default()
};
Fields
num_experts: usize
Number of experts in the MoE layer
This determines the total number of expert networks available for routing. Typical values range from 8 to 1024 depending on model size and requirements.
num_experts_per_token: usize
Number of experts to activate per token (top-k)
Each token is routed to the top-k experts based on router scores. Common values are 1, 2, or 4. Higher values increase computational cost but may improve model quality.
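The top-k routing described above can be sketched as follows. This is an illustrative standalone function, not part of the crate's API:

```rust
// Illustrative sketch (not the crate's API): select the top-k expert indices
// for one token from its router scores.
fn top_k_experts(scores: &[f32], k: usize) -> Vec<usize> {
    let mut idx: Vec<usize> = (0..scores.len()).collect();
    // Sort indices by descending score (assumes no NaN scores).
    idx.sort_by(|&a, &b| scores[b].partial_cmp(&scores[a]).unwrap());
    idx.truncate(k);
    idx
}

fn main() {
    // Scores for 4 experts; top-2 routing picks experts 1 and 3.
    println!("{:?}", top_k_experts(&[0.1, 0.7, 0.2, 0.5], 2));
}
```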
capacity_factor: f32
Expert capacity factor (capacity = tokens_per_expert * capacity_factor)
This factor determines how many tokens each expert can process. Values > 1.0 provide buffer capacity to handle load imbalance. Typical range: 1.0 to 2.0.
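The capacity formula above can be sketched as a standalone computation (the function name is an assumption for illustration, not the crate's API):

```rust
// Illustrative sketch of the capacity formula:
// capacity = (total_tokens / num_experts) * capacity_factor
fn expert_capacity(total_tokens: usize, num_experts: usize, capacity_factor: f32) -> usize {
    let tokens_per_expert = total_tokens as f32 / num_experts as f32;
    // Round up so buffer capacity is never truncated away.
    (tokens_per_expert * capacity_factor).ceil() as usize
}

fn main() {
    // 4096 tokens across 16 experts with a 1.5 buffer: 256 * 1.5 = 384.
    println!("{}", expert_capacity(4096, 16, 1.5));
}
```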
load_balance_loss_coeff: f32
Load balancing loss coefficient
Weight for the auxiliary loss that encourages balanced expert utilization. Higher values enforce stronger load balancing but may hurt model quality. Typical range: 0.001 to 0.1.
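One common form of this auxiliary loss is the Switch-Transformer-style term loss = coeff * num_experts * sum_i(f_i * p_i), where f_i is the fraction of tokens routed to expert i and p_i the mean router probability for expert i; it is minimized when both are uniform. The crate's exact formulation may differ, so the sketch below is only illustrative:

```rust
// Hedged sketch of a Switch-Transformer-style auxiliary load-balancing loss.
// The crate's exact formulation may differ.
fn load_balance_loss(token_fractions: &[f32], mean_probs: &[f32], coeff: f32) -> f32 {
    let num_experts = token_fractions.len() as f32;
    let dot: f32 = token_fractions
        .iter()
        .zip(mean_probs)
        .map(|(f, p)| f * p)
        .sum();
    coeff * num_experts * dot
}

fn main() {
    // Perfectly balanced 4-expert routing: the loss reduces to the coefficient.
    println!("{}", load_balance_loss(&[0.25; 4], &[0.25; 4], 0.01));
}
```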
router_z_loss_coeff: f32
Router z-loss coefficient (for numerical stability)
Weight for the z-loss that encourages router logits to stay close to zero, improving numerical stability. Typical range: 0.0001 to 0.01.
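A common form of the z-loss is coeff times the mean over tokens of the squared log-sum-exp of the router logits, which penalizes large-magnitude logits. The crate's exact form may differ; this is a hedged sketch:

```rust
// Hedged sketch of a router z-loss:
// coeff * mean over tokens of (log-sum-exp of the router logits)^2.
fn router_z_loss(logits_per_token: &[Vec<f32>], coeff: f32) -> f32 {
    let sum_sq: f32 = logits_per_token
        .iter()
        .map(|logits| {
            // Numerically stable log-sum-exp.
            let max = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
            let lse = max + logits.iter().map(|x| (x - max).exp()).sum::<f32>().ln();
            lse * lse
        })
        .sum();
    coeff * sum_sq / logits_per_token.len() as f32
}

fn main() {
    // One token, two equal logits of 0.0: lse = ln(2), loss = coeff * ln(2)^2.
    println!("{}", router_z_loss(&[vec![0.0, 0.0]], 1.0));
}
```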
expert_dropout: f32
Expert dropout probability during training
Probability of randomly dropping experts during training to improve robustness and prevent overfitting. Range: 0.0 to 1.0.
enable_load_balancing: bool
Enable load balancing across devices
When true, the system actively monitors and rebalances expert utilization across different devices to optimize resource usage.
sharding_strategy: ExpertShardingStrategy
Expert sharding strategy
Determines how experts are distributed across devices and processes.
max_expert_batch_size: Option<usize>
Maximum batch size for expert processing
Limits the number of tokens that can be processed by a single expert in one forward pass. Helps control memory usage.
enable_gradient_accumulation: bool
Enable gradient accumulation across experts
When true, gradients are accumulated across multiple expert invocations before updating parameters, which can improve training stability.
gradient_accumulation_steps: usize
Number of gradient accumulation steps
Only relevant when gradient accumulation is enabled.
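Gradient accumulation can be sketched as averaging gradients over several micro-batches before applying one parameter update. This standalone example is illustrative only, not the crate's implementation:

```rust
// Illustrative sketch (not the crate's API): average gradients over several
// micro-batches before applying a single parameter update.
fn accumulated_gradient(micro_grads: &[Vec<f32>]) -> Vec<f32> {
    let steps = micro_grads.len() as f32;
    let dim = micro_grads[0].len();
    let mut acc = vec![0.0f32; dim];
    // Sum gradients element-wise across the micro-batches.
    for g in micro_grads {
        for (a, x) in acc.iter_mut().zip(g) {
            *a += x;
        }
    }
    // Scale by the number of accumulation steps.
    acc.iter().map(|a| a / steps).collect()
}

fn main() {
    // Two accumulation steps over a 2-parameter gradient.
    println!("{:?}", accumulated_gradient(&[vec![1.0, 2.0], vec![3.0, 4.0]]));
}
```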
initialization_strategy: ExpertInitStrategy
Expert initialization strategy
Method used to initialize expert parameters.
enable_expert_sync: bool
Enable expert synchronization
When true, experts synchronize their parameters periodically during training.
sync_frequency: usize
Synchronization frequency (in steps)
How often to synchronize expert parameters when synchronization is enabled.
gate_network: Option<GateNetworkConfig>
Gate network configuration
Optional configuration for hierarchical or advanced gate networks.
load_balancing: Option<LoadBalancingConfig>
Load balancing configuration
Configuration for expert load balancing and migration.
migration: Option<ExpertMigrationConfig>
Migration configuration
Configuration for expert migration strategies and triggers.
enable_expert_migration: bool
Enable expert migration (simplified flag)
migration_threshold: f32
Migration threshold for triggering migrations
memory_per_expert_mb: usize
Memory allocated per expert (in MB)
communication_overlap: bool
Enable communication overlap
gradient_compression: bool
Enable gradient compression
Implementations
impl ExpertParallelismConfig
pub fn small_scale() -> Self
Create a configuration optimized for small-scale deployment
Returns
A configuration suitable for models with 8-16 experts
pub fn large_scale() -> Self
Create a configuration optimized for large-scale deployment
Returns
A configuration suitable for models with 64+ experts
pub fn inference() -> Self
Create a configuration optimized for inference
Returns
A configuration with settings optimized for inference workloads
pub fn calculate_expert_capacity(&self, total_tokens: usize) -> usize
Calculate the per-expert token capacity for the given total token count, based on num_experts and capacity_factor.
pub fn recommended_num_devices(&self) -> usize
Get the recommended number of devices for this configuration
Returns
Recommended number of devices based on the sharding strategy
Trait Implementations
impl Clone for ExpertParallelismConfig
fn clone(&self) -> ExpertParallelismConfig
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.
impl Debug for ExpertParallelismConfig
impl Default for ExpertParallelismConfig
impl<'de> Deserialize<'de> for ExpertParallelismConfig
fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>
where
    __D: Deserializer<'de>,
Auto Trait Implementations
impl Freeze for ExpertParallelismConfig
impl RefUnwindSafe for ExpertParallelismConfig
impl Send for ExpertParallelismConfig
impl Sync for ExpertParallelismConfig
impl Unpin for ExpertParallelismConfig
impl UnsafeUnpin for ExpertParallelismConfig
impl UnwindSafe for ExpertParallelismConfig
Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> Instrument for T
fn instrument(self, span: Span) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true; otherwise converts self into a Right variant.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true; otherwise converts self into a Right variant.