sipp-rs 0.1.0

Unified Rust library for extensible Sipp inference
use crate::runtime::config::{SchedulerPolicyConfig, SchedulerPolicyMode, SchedulerTickBudget};
use crate::runtime::numeric::{nonnegative_i32, positive_i32};

// Throughput-first mode keeps NUMERATOR / DENOMINATOR (3/4) of the total token budget as a
// prefill floor; see `throughput_prefill_floor`.
const THROUGHPUT_PREFILL_FLOOR_NUMERATOR: i32 = 3;
const THROUGHPUT_PREFILL_FLOOR_DENOMINATOR: i32 = 4;
// Decode cap applied in throughput-first mode when the policy configures no decode reserve.
const MIN_THROUGHPUT_DECODE_RESERVE: i32 = 1;
// Prefill floor used in balanced mode whenever the total budget exceeds one token.
const BALANCED_PREFILL_FLOOR: i32 = 1;

/// Splits one scheduler tick's token budget between decode and prefill work.
///
/// The total budget is `nonnegative_i32(max_batch_tokens)` and `decode_first` is set whenever
/// the raw `decode_ready_count` is positive. The split is then decided in this order:
///
/// 1. Total budget not positive: the budget is returned with the remaining fields at their
///    defaults.
/// 2. No decode work ready: the whole budget is reserved for prefill.
/// 3. No prefill work ready: decode is reserved `min(decode_ready, total)` tokens and prefill
///    keeps the difference.
/// 4. Both ready: `policy.mode` selects the decode share and prefill receives what is left.
///
/// # Arguments
/// * `policy` - scheduling mode plus a decode token reserve that only applies when it is > 0.
/// * `decode_ready_count` - decode-ready count; passed through `nonnegative_i32` before use.
/// * `prefill_ready_count` - prefill-ready count; passed through `nonnegative_i32` before use.
/// * `max_batch_tokens` - token budget for this tick.
///
/// # Returns
/// The populated `SchedulerTickBudget`.
pub(super) fn build_tick_budget(
    policy: SchedulerPolicyConfig,
    decode_ready_count: i32,
    prefill_ready_count: i32,
    max_batch_tokens: i32,
) -> SchedulerTickBudget {
    let total = nonnegative_i32(max_batch_tokens);
    let mut budget = SchedulerTickBudget {
        total_token_budget: total,
        decode_first: decode_ready_count > 0,
        ..SchedulerTickBudget::default()
    };

    // Nothing to hand out this tick.
    if total <= 0 {
        return budget;
    }

    let decode_ready = nonnegative_i32(decode_ready_count);
    let prefill_ready = nonnegative_i32(prefill_ready_count);

    // Prefill-only tick: decode gets nothing, prefill gets everything.
    if decode_ready == 0 {
        budget.reserved_decode_tokens = 0;
        budget.reserved_prefill_tokens = total;
        return budget;
    }

    // Decode can never be reserved more than the ready count or the whole budget.
    let decode_cap = decode_ready.min(total);

    // Decode-only tick: no policy arbitration is needed.
    if prefill_ready == 0 {
        budget.reserved_decode_tokens = decode_cap;
        budget.reserved_prefill_tokens = total - decode_cap;
        return budget;
    }

    // `Some(limit)` only when the policy configures a positive decode reserve.
    let reserve_cap =
        has_decode_reserve(policy).then(|| requested_decode_reserve(policy, decode_ready));

    let decode_tokens = match policy.mode {
        // Decode takes as much as it can use, bounded only by an explicit reserve.
        SchedulerPolicyMode::LatencyFirst => {
            reserve_cap.map_or(decode_cap, |limit| decode_cap.min(limit))
        }
        // Prefill keeps its throughput floor; without an explicit reserve decode is limited
        // to the minimum throughput reserve.
        SchedulerPolicyMode::ThroughputFirst => {
            let ceiling = positive_i32(total - throughput_prefill_floor(total));
            let reserve = reserve_cap.unwrap_or(MIN_THROUGHPUT_DECODE_RESERVE);
            decode_cap.min(ceiling).min(reserve)
        }
        // Prefill keeps the balanced floor; an explicit reserve can tighten decode further.
        SchedulerPolicyMode::Balanced => {
            let ceiling = remaining_token_budget(total, balanced_prefill_floor(total));
            let capped = decode_cap.min(ceiling);
            reserve_cap.map_or(capped, |limit| capped.min(limit))
        }
    };

    budget.reserved_decode_tokens = decode_tokens;
    budget.reserved_prefill_tokens = remaining_token_budget(total, decode_tokens);
    budget
}

/// Reports whether the policy configures a decode token reserve, i.e. whether
/// `policy.decode_token_reserve` is strictly positive.
fn has_decode_reserve(policy: SchedulerPolicyConfig) -> bool {
    let configured_reserve = policy.decode_token_reserve;
    configured_reserve > 0
}

/// Returns the decode reserve to request for this tick.
///
/// When the policy configures no positive reserve, the full `decode_ready_count` is
/// requested; otherwise the configured reserve is used, capped at `decode_ready_count`.
fn requested_decode_reserve(policy: SchedulerPolicyConfig, decode_ready_count: i32) -> i32 {
    if !has_decode_reserve(policy) {
        return decode_ready_count;
    }
    decode_ready_count.min(policy.decode_token_reserve)
}

/// Returns the number of tokens held back for prefill under the throughput-first policy.
///
/// Budgets of one token or less get no floor (`0`). Larger budgets get
/// `floor(total_token_budget * NUMERATOR / DENOMINATOR)` passed through `positive_i32`
/// (defined in `crate::runtime::numeric`; presumably clamps to at least 1 — confirm there).
///
/// The product is evaluated in `i64`: multiplying directly in `i32` overflows once the budget
/// exceeds `i32::MAX / NUMERATOR`, which panics in debug builds and wraps in release builds.
fn throughput_prefill_floor(total_token_budget: i32) -> i32 {
    if total_token_budget <= 1 {
        return 0;
    }

    let scaled = i64::from(total_token_budget) * i64::from(THROUGHPUT_PREFILL_FLOOR_NUMERATOR)
        / i64::from(THROUGHPUT_PREFILL_FLOOR_DENOMINATOR);

    // Saturate into the `i32` range first so the narrowing cast can never truncate. With a
    // ratio below one the value is already smaller than `total_token_budget`.
    let floor = scaled.clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32;
    positive_i32(floor)
}

/// Returns the number of tokens held back for prefill under the balanced policy.
///
/// Budgets of one token or less get no floor (`0`); anything larger gets
/// `BALANCED_PREFILL_FLOOR`.
fn balanced_prefill_floor(total_token_budget: i32) -> i32 {
    // A budget of at most one token cannot be shared, so no floor is applied.
    if total_token_budget <= 1 {
        return 0;
    }
    BALANCED_PREFILL_FLOOR
}

/// Returns what is left of `total_token_budget` once `reserved_tokens` are set aside.
///
/// The raw difference is passed through `nonnegative_i32` (defined in
/// `crate::runtime::numeric`; presumably clamps negatives to zero — confirm there).
fn remaining_token_budget(total_token_budget: i32, reserved_tokens: i32) -> i32 {
    let leftover = total_token_budget - reserved_tokens;
    nonnegative_i32(leftover)
}