alith_interface/llms/local/mod.rs

#[cfg(any(target_os = "linux", target_os = "windows"))]
use alith_devices::devices::CudaConfig;
use alith_devices::devices::DeviceConfig;
#[cfg(target_os = "macos")]
use alith_devices::devices::MetalConfig;
use alith_models::local_model::{
    LocalLLMModel, gguf::GgufLoader, metadata::llm::DEFAULT_CONTEXT_LENGTH,
};

pub mod llama_cpp;
pub mod mistral_rs;
pub mod ort;

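/// Shared configuration for local LLM backends: batch size, inference
/// context size, and device placement.
///
/// # Example
///
/// A minimal sketch of overriding one default in place (the context size
/// shown is an arbitrary illustrative value):
///
/// ```ignore
/// let config = LocalLLMConfig {
///     inference_ctx_size: 8192,
///     ..Default::default()
/// };
/// ```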
#[derive(Clone, Debug)]
pub struct LocalLLMConfig {
    pub batch_size: u64,
    pub inference_ctx_size: u64,
    pub device_config: DeviceConfig,
}

impl Default for LocalLLMConfig {
    fn default() -> Self {
        Self {
            batch_size: 512,
            inference_ctx_size: DEFAULT_CONTEXT_LENGTH,
            device_config: DeviceConfig::default(),
        }
    }
}

impl LocalLLMConfig {
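    /// Loads the model described by `llm_loader`, clamps the requested
    /// context size to the model's maximum, and records layer count, average
    /// layer size, and model path on the device config.
    ///
    /// # Example
    ///
    /// A minimal sketch (assumes `GgufLoader`'s default resolves to a preset
    /// model; purely illustrative):
    ///
    /// ```ignore
    /// let mut config = LocalLLMConfig::default();
    /// let model = config.load_model(GgufLoader::default())?;
    /// ```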
    pub fn load_model(&mut self, mut llm_loader: GgufLoader) -> crate::Result<LocalLLMModel> {
        // Fall back to preset loading only when neither an explicit local
        // quant file nor an HF quant URL was provided.
        let model = if llm_loader.gguf_local_loader.local_quant_file_path.is_none()
            && llm_loader.gguf_hf_loader.hf_quant_file_url.is_none()
        {
            self.load_preset_model(llm_loader)?
        } else {
            llm_loader.load()?
        };

        if self.inference_ctx_size > model.model_metadata.context_length() {
            eprintln!(
                "Given value for ctx_size {} is greater than the model's max {}. Using the model's max.",
                self.inference_ctx_size,
                model.model_metadata.context_length()
            );
            self.inference_ctx_size = model.model_metadata.context_length();
        }

        self.device_config.layer_count = Some(model.model_metadata.layers.count_blocks());
        self.device_config.average_layer_size_bytes = Some(
            model
                .model_metadata
                .average_layer_size_bytes(self.inference_ctx_size, Some(self.batch_size))?,
        );
        self.device_config.local_model_path = model.local_model_path.to_string_lossy().to_string();

        Ok(model)
    }

    fn load_preset_model(&mut self, mut llm_loader: GgufLoader) -> crate::Result<LocalLLMModel> {
        if llm_loader
            .gguf_preset_loader
            .preset_with_quantization_level
            .is_some()
        {
            return llm_loader.load();
        }

        if let Some(preset_with_max_ctx_size) =
            llm_loader.gguf_preset_loader.preset_with_max_ctx_size
        {
            if self.inference_ctx_size > preset_with_max_ctx_size {
                crate::info!(
                    "Given value for ctx_size {} is greater than preset_with_max_ctx_size {preset_with_max_ctx_size}. Using preset_with_max_ctx_size.",
                    self.inference_ctx_size
                );
                self.inference_ctx_size = preset_with_max_ctx_size;
            }
        } else {
            llm_loader.gguf_preset_loader.preset_with_max_ctx_size = Some(self.inference_ctx_size);
        }
        llm_loader
            .gguf_preset_loader
            .preset_with_available_vram_bytes = Some(self.device_config.available_memory_bytes()?);

        llm_loader.load()
    }
}

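/// Builder-style configuration shared by the local backends in this module.
///
/// # Example
///
/// A minimal sketch of chaining the setters; the generic helper below uses
/// only methods defined on this trait, and the values are illustrative:
///
/// ```ignore
/// fn configure<T: LLMLocalTrait>(backend: T) -> T {
///     backend
///         .use_gpu(true)
///         .threads(8)
///         .batch_size(1024)
///         .inference_ctx_size(8192)
/// }
/// ```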
pub trait LLMLocalTrait {
    fn config(&mut self) -> &mut LocalLLMConfig;

    /// If enabled, any issues with the configuration will result in an error.
    /// Otherwise, fallbacks will be used.
    /// Useful if you have a specific configuration in mind and want to ensure it is used.
    ///
    /// # Arguments
    ///
    /// * `error_on_config_issue` - A boolean indicating whether to error on configuration issues.
    ///
    /// # Default
    ///
    /// Defaults to false.
    fn error_on_config_issue(mut self, error_on_config_issue: bool) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.error_on_config_issue = error_on_config_issue;
        self
    }

    /// Enables or disables GPU usage for inference.
    ///
    /// # Arguments
    ///
    /// * `use_gpu` - A boolean indicating whether to use GPU (true) or not (false).
    ///
    /// # Notes
    ///
    /// On macOS, this setting affects Metal usage. On other platforms, it typically
    /// affects CUDA usage.
    ///
    /// # Default
    ///
    /// Defaults to true. If set to false, CPU inference will be used.
    fn use_gpu(mut self, use_gpu: bool) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.use_gpu = use_gpu;
        self
    }

    #[cfg(target_os = "macos")]
    /// Enables or disables Metal usage for inference on macOS.
    ///
    /// # Arguments
    ///
    /// * `use_metal` - A boolean indicating whether to use Metal (true) or not (false).
    ///
    /// # Notes
    ///
    /// This method is only available on macOS and is equivalent to `use_gpu`.
    ///
    /// # Default
    ///
    /// Defaults to true on macOS.
    fn use_metal(mut self, use_metal: bool) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.use_gpu = use_metal;
        self
    }

    /// Disables GPU usage and forces CPU-only inference.
    ///
    /// # Notes
    ///
    /// This is equivalent to calling `use_gpu(false)`.
    ///
    /// # Default
    ///
    /// GPU usage is enabled by default, so CPU-only inference is off unless this is called.
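    ///
    /// # Example
    ///
    /// A minimal sketch via a generic helper (illustrative only):
    ///
    /// ```ignore
    /// fn force_cpu<T: LLMLocalTrait>(backend: T) -> T {
    ///     backend.cpu_only()
    /// }
    /// ```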
    fn cpu_only(mut self) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.use_gpu = false;
        self
    }

    /// Sets the number of CPU threads to use for inference.
    ///
    /// # Arguments
    ///
    /// * `threads` - The number of CPU threads to use.
    ///
    /// # Notes
    ///
    /// If loading purely in VRAM, this defaults to 1.
    fn threads(mut self, threads: i16) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.cpu_config.threads = Some(threads);
        self
    }

    /// Sets the number of CPU threads to use for batching and prompt processing.
    ///
    /// # Arguments
    ///
    /// * `threads_batch` - The number of CPU threads to use for batching and prompt processing.
    ///
    /// # Default
    ///
    /// If not set, defaults to a percentage of the total system threads.
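    ///
    /// # Example
    ///
    /// A minimal sketch combining this with `threads` (values illustrative):
    ///
    /// ```ignore
    /// fn set_threads<T: LLMLocalTrait>(backend: T) -> T {
    ///     backend.threads(8).threads_batch(4)
    /// }
    /// ```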
    fn threads_batch(mut self, threads_batch: i16) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.cpu_config.threads_batch = Some(threads_batch);
        self
    }

    /// Sets the batch size for inference.
    ///
    /// # Arguments
    ///
    /// * `batch_size` - The batch size to use.
    ///
    /// # Default
    ///
    /// If not set, defaults to 512.
    fn batch_size(mut self, batch_size: u64) -> Self
    where
        Self: Sized,
    {
        self.config().batch_size = batch_size;
        self
    }

    /// Sets the inference context size (the model's context window).
    ///
    /// # Arguments
    ///
    /// * `inference_ctx_size` - The maximum number of tokens the context window can hold,
    ///   covering both the prompt and the generated output.
    ///
    /// # Notes
    ///
    /// This value is fixed when the model is loaded and cannot be changed afterwards.
    /// It is clamped to the model's maximum context length. If not set, a default
    /// value will be used.
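    ///
    /// # Example
    ///
    /// A minimal sketch (values are illustrative):
    ///
    /// ```ignore
    /// fn set_sizes<T: LLMLocalTrait>(backend: T) -> T {
    ///     backend.batch_size(1024).inference_ctx_size(8192)
    /// }
    /// ```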
    fn inference_ctx_size(mut self, inference_ctx_size: u64) -> Self
    where
        Self: Sized,
    {
        self.config().inference_ctx_size = inference_ctx_size;
        self
    }

    /// Sets the amount of RAM to use for inference.
    ///
    /// # Arguments
    ///
    /// * `available_ram_gb` - The amount of RAM to use, in gigabytes.
    ///
    /// # Effects
    ///
    /// - On macOS: Affects all inference operations.
    /// - On Windows and Linux: Affects CPU inference only.
    ///
    /// # Default Behavior
    ///
    /// If this method is not called, the amount of RAM used will default to a percentage
    /// of the total system RAM. See `use_ram_percentage` for details on setting this percentage.
    ///
    /// # Notes
    ///
    /// The input value is converted to bytes internally. Precision may be affected for
    /// very large values due to floating-point to integer conversion.
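    ///
    /// # Example
    ///
    /// A minimal sketch; 8.0 GB is converted internally to
    /// 8.0 * 1_073_741_824 = 8_589_934_592 bytes:
    ///
    /// ```ignore
    /// fn cap_ram<T: LLMLocalTrait>(backend: T) -> T {
    ///     backend.use_ram_gb(8.0)
    /// }
    /// ```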
    fn use_ram_gb(mut self, available_ram_gb: f32) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.ram_config.use_ram_bytes =
            (available_ram_gb * 1_073_741_824f32) as u64;
        #[cfg(target_os = "macos")]
        {
            if let Some(metal_config) = &mut self.config().device_config.metal_config {
                metal_config.use_ram_bytes = (available_ram_gb * 1_073_741_824f32) as u64;
            } else {
                let metal_config = MetalConfig {
                    use_ram_bytes: (available_ram_gb * 1_073_741_824f32) as u64,
                    ..Default::default()
                };
                self.config().device_config.metal_config = Some(metal_config);
            }
        }
        self
    }

    /// Sets the percentage of total system RAM to use for inference.
    ///
    /// # Arguments
    ///
    /// * `use_ram_percentage` - The percentage of total system RAM to use, expressed as a float
    ///   between 0.0 and 1.0.
    ///
    /// # Effects
    ///
    /// - On macOS: Affects all inference operations.
    /// - On Windows and Linux: Affects CPU inference only.
    ///
    /// # Default Behavior
    ///
    /// If neither this method nor `use_ram_gb` is called, the default is 70% (0.7) of
    /// total system RAM on Windows and Linux, or 90% (0.9) on macOS.
    ///
    /// # Precedence
    ///
    /// This setting is only used if `use_ram_gb` has not been called. If `use_ram_gb` has been
    /// set, that value takes precedence over the percentage set here.
    ///
    /// # Notes
    ///
    /// It's recommended to set this value conservatively to avoid potential system instability
    /// or performance issues caused by memory pressure.
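    ///
    /// # Example
    ///
    /// A minimal sketch capping inference at half of system RAM (illustrative):
    ///
    /// ```ignore
    /// fn cap_ram_fraction<T: LLMLocalTrait>(backend: T) -> T {
    ///     backend.use_ram_percentage(0.5)
    /// }
    /// ```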
    fn use_ram_percentage(mut self, use_ram_percentage: f32) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.ram_config.use_percentage = use_ram_percentage;
        #[cfg(target_os = "macos")]
        {
            if let Some(metal_config) = &mut self.config().device_config.metal_config {
                metal_config.use_percentage = use_ram_percentage;
            } else {
                let metal_config = MetalConfig {
                    use_percentage: use_ram_percentage,
                    ..Default::default()
                };
                self.config().device_config.metal_config = Some(metal_config);
            }
        }
        self
    }

    #[cfg(any(target_os = "linux", target_os = "windows"))]
    /// Sets the CUDA configuration for GPU inference.
    ///
    /// # Arguments
    ///
    /// * `cuda_config` - The CUDA configuration to use.
    ///
    /// # Notes
    ///
    /// This method is only available on Linux and Windows.
    /// If not set, CUDA devices will be automatically detected.
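    ///
    /// # Example
    ///
    /// A minimal sketch (assumes `CudaConfig` provides a `Default` impl; illustrative only):
    ///
    /// ```ignore
    /// fn with_cuda<T: LLMLocalTrait>(backend: T) -> T {
    ///     backend.cuda_config(CudaConfig::default())
    /// }
    /// ```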
    fn cuda_config(mut self, cuda_config: CudaConfig) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.cuda_config = Some(cuda_config);
        self
    }

    #[cfg(target_os = "macos")]
    /// Sets the Metal configuration for GPU inference on macOS.
    ///
    /// # Arguments
    ///
    /// * `metal_config` - The Metal configuration to use.
    ///
    /// # Notes
    ///
    /// This method is only available on macOS.
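    ///
    /// # Example
    ///
    /// A minimal sketch; `use_percentage` and the `Default` impl are the same
    /// ones used by `use_ram_percentage` above (the 0.5 value is illustrative):
    ///
    /// ```ignore
    /// fn with_metal<T: LLMLocalTrait>(backend: T) -> T {
    ///     backend.metal_config(MetalConfig {
    ///         use_percentage: 0.5,
    ///         ..Default::default()
    ///     })
    /// }
    /// ```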
    fn metal_config(mut self, metal_config: MetalConfig) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.metal_config = Some(metal_config);
        self
    }
}