sipp-rs 0.1.0

Unified Rust library for extensible Sipp inference
use std::borrow::Cow;

use serde::{Deserialize, Serialize};

use crate::defaults::BYTES_PER_MIB_U64;

use super::{args_len, positive_or_none, push_arg, push_csv_arg, push_flag, push_optional_arg};

const ALWAYS_EMITTED_KEY_VALUE_ARGS: usize = 3;
const BASE_ARG_LEN: usize = ALWAYS_EMITTED_KEY_VALUE_ARGS * super::KEY_VALUE_ARG_LEN;

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct ModelPlacementConfig {
    pub devices: Vec<String>,
    pub gpu_layers: GpuLayerConfig,
    pub split_mode: SplitMode,
    pub main_gpu: Option<i32>,
    pub tensor_split: Vec<f32>,
    pub use_mmap: bool,
    pub use_mlock: bool,
    pub fit_params: bool,
    pub fit_params_min_ctx: Option<i32>,
    pub fit_params_target_bytes: Vec<u64>,
    pub check_tensors: bool,
    pub no_extra_bufts: bool,
    pub no_host: bool,
}

impl Default for ModelPlacementConfig {
    fn default() -> Self {
        Self {
            devices: Vec::new(),
            gpu_layers: GpuLayerConfig::Auto,
            split_mode: SplitMode::Layer,
            main_gpu: None,
            tensor_split: Vec::new(),
            use_mmap: cfg!(not(target_family = "wasm")),
            use_mlock: false,
            fit_params: false,
            fit_params_min_ctx: None,
            fit_params_target_bytes: Vec::new(),
            check_tensors: false,
            no_extra_bufts: false,
            no_host: false,
        }
    }
}

impl ModelPlacementConfig {
    pub(super) fn normalize(&mut self) {
        if let GpuLayerConfig::Count(count) = self.gpu_layers {
            self.gpu_layers = GpuLayerConfig::from_layer_count(count);
        }
        self.main_gpu = positive_or_none(self.main_gpu, 0);
        self.fit_params_min_ctx = positive_or_none(self.fit_params_min_ctx, 1);
    }

    pub(super) fn arg_len(&self) -> usize {
        args_len(
            BASE_ARG_LEN,
            [
                !self.devices.is_empty(),
                self.main_gpu.is_some(),
                !self.tensor_split.is_empty(),
                self.fit_params_min_ctx.is_some(),
                !self.fit_params_target_bytes.is_empty(),
            ],
            [
                self.use_mlock,
                !self.use_mmap,
                self.check_tensors,
                self.no_extra_bufts,
                self.no_host,
            ],
        )
    }

    pub(super) fn push_args(&self, args: &mut Vec<String>) {
        if !self.devices.is_empty() {
            push_csv_arg(args, "--device", self.devices.iter());
        }
        push_arg(args, "--gpu-layers", self.gpu_layers.to_llama_arg());
        push_arg(args, "--split-mode", self.split_mode.as_llama_arg());
        push_optional_arg(args, "--main-gpu", self.main_gpu);
        if !self.tensor_split.is_empty() {
            push_csv_arg(args, "--tensor-split", self.tensor_split.iter());
        }
        push_arg(args, "--fit", if self.fit_params { "on" } else { "off" });
        push_optional_arg(args, "--fit-ctx", self.fit_params_min_ctx);
        if !self.fit_params_target_bytes.is_empty() {
            push_csv_arg(
                args,
                "--fit-target",
                self.fit_params_target_bytes
                    .iter()
                    .map(|bytes| bytes / BYTES_PER_MIB_U64),
            );
        }
        push_flag(args, "--mlock", self.use_mlock);
        push_flag(args, "--no-mmap", !self.use_mmap);
        push_flag(args, "--check-tensors", self.check_tensors);
        push_flag(args, "--no-repack", self.no_extra_bufts);
        push_flag(args, "--no-host", self.no_host);
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum GpuLayerConfig {
    Auto,
    All,
    Count(i32),
}

impl GpuLayerConfig {
    pub fn from_layer_count(count: i32) -> Self {
        if count < 0 {
            Self::All
        } else {
            Self::Count(count)
        }
    }

    fn to_llama_arg(self) -> Cow<'static, str> {
        match self {
            Self::Auto => Cow::Borrowed("auto"),
            Self::All => Cow::Borrowed("all"),
            Self::Count(count) => Cow::Owned(count.to_string()),
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SplitMode {
    None,
    Layer,
    Row,
    Tensor,
}

impl SplitMode {
    fn as_llama_arg(self) -> &'static str {
        match self {
            Self::None => "none",
            Self::Layer => "layer",
            Self::Row => "row",
            Self::Tensor => "tensor",
        }
    }
}

#[cfg(test)]
#[path = "../../../tests/runtime/config/inference_config/placement_tests.rs"]
mod placement_tests;