alith_interface/llms/local/mod.rs
#[cfg(any(target_os = "linux", target_os = "windows"))]
use alith_devices::devices::CudaConfig;
use alith_devices::devices::DeviceConfig;
#[cfg(target_os = "macos")]
use alith_devices::devices::MetalConfig;
use alith_models::local_model::{
    LocalLLMModel, gguf::GgufLoader, metadata::llm::DEFAULT_CONTEXT_LENGTH,
};

pub mod llama_cpp;
pub mod mistral_rs;
pub mod ort;

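/// Configuration shared by the local inference backends (llama.cpp,
/// mistral.rs, ort): the batch size used during inference, the inference
/// context size, and the device placement settings.
///
/// A minimal sketch of overriding one default while keeping the rest; the
/// value shown is illustrative, not a recommendation:
///
/// ```ignore
/// let config = LocalLLMConfig {
///     batch_size: 256,
///     ..Default::default()
/// };
/// ```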
#[derive(Clone, Debug)]
pub struct LocalLLMConfig {
    pub batch_size: u64,
    pub inference_ctx_size: u64,
    pub device_config: DeviceConfig,
}

impl Default for LocalLLMConfig {
    fn default() -> Self {
        Self {
            batch_size: 512,
            inference_ctx_size: DEFAULT_CONTEXT_LENGTH,
            device_config: DeviceConfig::default(),
        }
    }
}

impl LocalLLMConfig {
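    /// Loads the model described by `llm_loader`, falling back to a preset
    /// when no explicit quant source (local file or Hugging Face URL) was
    /// given. Clamps `inference_ctx_size` to the model's maximum and fills in
    /// the device config (layer count, average layer size, model path).
    ///
    /// A minimal usage sketch, assuming `loader` is a configured `GgufLoader`:
    ///
    /// ```ignore
    /// let mut config = LocalLLMConfig::default();
    /// let model = config.load_model(loader)?;
    /// ```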
    pub fn load_model(&mut self, mut llm_loader: GgufLoader) -> crate::Result<LocalLLMModel> {
        // Fall back to a preset only when neither a local quant file nor an
        // HF quant URL was provided; otherwise load the requested file.
        let model = if llm_loader.gguf_local_loader.local_quant_file_path.is_none()
            && llm_loader.gguf_hf_loader.hf_quant_file_url.is_none()
        {
            self.load_preset_model(llm_loader)?
        } else {
            llm_loader.load()?
        };

        if self.inference_ctx_size > model.model_metadata.context_length() {
            eprintln!(
                "Given value for ctx_size {} is greater than the model's max {}. Using the model's max.",
                self.inference_ctx_size,
                model.model_metadata.context_length()
            );
            self.inference_ctx_size = model.model_metadata.context_length();
        };

        self.device_config.layer_count = Some(model.model_metadata.layers.count_blocks());
        self.device_config.average_layer_size_bytes = Some(
            model
                .model_metadata
                .average_layer_size_bytes(self.inference_ctx_size, Some(self.batch_size))?,
        );
        self.device_config.local_model_path = model.local_model_path.to_string_lossy().to_string();

        Ok(model)
    }

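    /// Loads a preset model: if a quantization level is already pinned the
    /// loader is used as-is; otherwise `inference_ctx_size` is clamped to the
    /// preset's maximum context size (or recorded as that maximum) and the
    /// available device memory is passed along so the loader can pick a
    /// quantization that fits.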
    fn load_preset_model(&mut self, mut llm_loader: GgufLoader) -> crate::Result<LocalLLMModel> {
        if llm_loader
            .gguf_preset_loader
            .preset_with_quantization_level
            .is_some()
        {
            return llm_loader.load();
        };

        if let Some(preset_with_max_ctx_size) =
            llm_loader.gguf_preset_loader.preset_with_max_ctx_size
        {
            if self.inference_ctx_size > preset_with_max_ctx_size {
                crate::info!(
                    "Given value for ctx_size {} is greater than preset_with_max_ctx_size {preset_with_max_ctx_size}. Using preset_with_max_ctx_size.",
                    self.inference_ctx_size
                );
                self.inference_ctx_size = preset_with_max_ctx_size;
            };
        } else {
            llm_loader.gguf_preset_loader.preset_with_max_ctx_size = Some(self.inference_ctx_size);
        }
        llm_loader
            .gguf_preset_loader
            .preset_with_available_vram_bytes = Some(self.device_config.available_memory_bytes()?);

        llm_loader.load()
    }
}

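/// Builder-style configuration shared by the local backends. Implementors
/// expose their [`LocalLLMConfig`] through [`config`](LLMLocalTrait::config)
/// and get the chainable setters below for free.
///
/// A minimal sketch of chaining the setters; `LlamaCppBackend` is a
/// hypothetical implementor used for illustration:
///
/// ```ignore
/// let backend = LlamaCppBackend::default()
///     .use_gpu(true)
///     .threads(8)
///     .batch_size(512)
///     .inference_ctx_size(4096);
/// ```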
pub trait LLMLocalTrait {
    /// Returns a mutable reference to the backend's local configuration.
    fn config(&mut self) -> &mut LocalLLMConfig;

    /// If enabled, any issues with the configuration will result in an error.
    /// Otherwise, fallbacks will be used.
    /// Useful if you have a specific configuration in mind and want to ensure it is used.
    ///
    /// # Arguments
    ///
    /// * `error_on_config_issue` - A boolean indicating whether to error on configuration issues.
    ///
    /// # Default
    ///
    /// Defaults to false.
    fn error_on_config_issue(mut self, error_on_config_issue: bool) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.error_on_config_issue = error_on_config_issue;
        self
    }

    /// Enables or disables GPU usage for inference.
    ///
    /// # Arguments
    ///
    /// * `use_gpu` - A boolean indicating whether to use the GPU (true) or not (false).
    ///
    /// # Notes
    ///
    /// On macOS, this setting affects Metal usage. On other platforms, it typically
    /// affects CUDA usage.
    ///
    /// # Default
    ///
    /// Defaults to true. If set to false, CPU inference will be used.
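    ///
    /// # Example
    ///
    /// A minimal sketch; `LlamaCppBackend` is a hypothetical implementor:
    ///
    /// ```ignore
    /// let backend = LlamaCppBackend::default().use_gpu(false); // CPU-only inference
    /// ```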
    fn use_gpu(mut self, use_gpu: bool) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.use_gpu = use_gpu;
        self
    }

    #[cfg(target_os = "macos")]
    /// Enables or disables Metal usage for inference on macOS.
    ///
    /// # Arguments
    ///
    /// * `use_metal` - A boolean indicating whether to use Metal (true) or not (false).
    ///
    /// # Notes
    ///
    /// This method is only available on macOS and is equivalent to `use_gpu`.
    ///
    /// # Default
    ///
    /// Defaults to true on macOS.
    fn use_metal(mut self, use_metal: bool) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.use_gpu = use_metal;
        self
    }

    /// Disables GPU usage and forces CPU-only inference.
    ///
    /// # Notes
    ///
    /// This is equivalent to calling `use_gpu(false)`.
    ///
    /// # Default
    ///
    /// GPU usage is enabled by default, so CPU-only mode is off unless this
    /// method (or `use_gpu(false)`) is called.
    fn cpu_only(mut self) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.use_gpu = false;
        self
    }

    /// Sets the number of CPU threads to use for inference.
    ///
    /// # Arguments
    ///
    /// * `threads` - The number of CPU threads to use.
    ///
    /// # Notes
    ///
    /// If the model is loaded entirely in VRAM, this defaults to 1.
    fn threads(mut self, threads: i16) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.cpu_config.threads = Some(threads);
        self
    }

    /// Sets the number of CPU threads to use for batching and prompt processing.
    ///
    /// # Arguments
    ///
    /// * `threads_batch` - The number of CPU threads to use for batching and prompt processing.
    ///
    /// # Default
    ///
    /// If not set, defaults to a percentage of the total system threads.
    fn threads_batch(mut self, threads_batch: i16) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.cpu_config.threads_batch = Some(threads_batch);
        self
    }

    /// Sets the batch size for inference.
    ///
    /// # Arguments
    ///
    /// * `batch_size` - The batch size to use.
    ///
    /// # Default
    ///
    /// If not set, defaults to 512.
    fn batch_size(mut self, batch_size: u64) -> Self
    where
        Self: Sized,
    {
        self.config().batch_size = batch_size;
        self
    }

    /// Sets the inference context size: the model's context window, i.e. the
    /// maximum number of tokens available for the prompt plus generated output.
    ///
    /// # Arguments
    ///
    /// * `inference_ctx_size` - The context size in tokens.
    ///
    /// # Notes
    ///
    /// This value is set when the model is loaded and cannot be changed afterwards.
    /// If it exceeds the model's maximum context length, it is clamped to that
    /// maximum. If not set, a default value will be used.
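    ///
    /// # Example
    ///
    /// A minimal sketch; `LlamaCppBackend` is a hypothetical implementor:
    ///
    /// ```ignore
    /// // Request a 4096-token context window (clamped at load time if the
    /// // model's maximum is smaller).
    /// let backend = LlamaCppBackend::default().inference_ctx_size(4096);
    /// ```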
    fn inference_ctx_size(mut self, inference_ctx_size: u64) -> Self
    where
        Self: Sized,
    {
        self.config().inference_ctx_size = inference_ctx_size;
        self
    }

    /// Sets the amount of RAM to use for inference.
    ///
    /// # Arguments
    ///
    /// * `available_ram_gb` - The amount of RAM to use, in gigabytes.
    ///
    /// # Effects
    ///
    /// - On macOS: Affects all inference operations.
    /// - On Windows and Linux: Affects CPU inference only.
    ///
    /// # Default Behavior
    ///
    /// If this method is not called, the amount of RAM used defaults to a percentage
    /// of the total system RAM. See `use_ram_percentage` for details on setting this percentage.
    ///
    /// # Notes
    ///
    /// The input value is interpreted as GiB (1 GiB = 1,073,741,824 bytes) and
    /// converted to bytes internally. Precision may be lost for very large values
    /// due to the floating-point to integer conversion.
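    ///
    /// # Example
    ///
    /// A minimal sketch; `LlamaCppBackend` is a hypothetical implementor:
    ///
    /// ```ignore
    /// // 8.0 GiB -> 8.0 * 1_073_741_824 = 8_589_934_592 bytes.
    /// let backend = LlamaCppBackend::default().use_ram_gb(8.0);
    /// ```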
    fn use_ram_gb(mut self, available_ram_gb: f32) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.ram_config.use_ram_bytes =
            (available_ram_gb * 1_073_741_824f32) as u64;
        #[cfg(target_os = "macos")]
        {
            if let Some(metal_config) = &mut self.config().device_config.metal_config {
                metal_config.use_ram_bytes = (available_ram_gb * 1_073_741_824f32) as u64;
            } else {
                let metal_config = MetalConfig {
                    use_ram_bytes: (available_ram_gb * 1_073_741_824f32) as u64,
                    ..Default::default()
                };
                self.config().device_config.metal_config = Some(metal_config);
            }
        }
        self
    }

    /// Sets the percentage of total system RAM to use for inference.
    ///
    /// # Arguments
    ///
    /// * `use_ram_percentage` - The percentage of total system RAM to use, expressed as a float
    ///   between 0.0 and 1.0.
    ///
    /// # Effects
    ///
    /// - On macOS: Affects all inference operations.
    /// - On Windows and Linux: Affects CPU inference only.
    ///
    /// # Default Behavior
    ///
    /// If neither this method nor `use_ram_gb` is called, the system uses 70% (0.7) of
    /// the available RAM by default on Windows and Linux, or 90% (0.9) on macOS.
    ///
    /// # Precedence
    ///
    /// This setting is only used if `use_ram_gb` has not been called. If `use_ram_gb` has been
    /// set, that value takes precedence over the percentage set here.
    ///
    /// # Notes
    ///
    /// It's recommended to set this value conservatively to avoid potential system instability
    /// or performance issues caused by memory pressure.
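    ///
    /// # Example
    ///
    /// A minimal sketch; `LlamaCppBackend` is a hypothetical implementor:
    ///
    /// ```ignore
    /// // Allow inference to use up to half of the system RAM.
    /// let backend = LlamaCppBackend::default().use_ram_percentage(0.5);
    /// ```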
    fn use_ram_percentage(mut self, use_ram_percentage: f32) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.ram_config.use_percentage = use_ram_percentage;
        #[cfg(target_os = "macos")]
        {
            if let Some(metal_config) = &mut self.config().device_config.metal_config {
                metal_config.use_percentage = use_ram_percentage;
            } else {
                let metal_config = MetalConfig {
                    use_percentage: use_ram_percentage,
                    ..Default::default()
                };
                self.config().device_config.metal_config = Some(metal_config);
            }
        }
        self
    }

    #[cfg(any(target_os = "linux", target_os = "windows"))]
    /// Sets the CUDA configuration for GPU inference.
    ///
    /// # Arguments
    ///
    /// * `cuda_config` - The CUDA configuration to use.
    ///
    /// # Notes
    ///
    /// This method is only available on Linux and Windows.
    /// If not set, CUDA devices will be automatically detected.
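    ///
    /// # Example
    ///
    /// A minimal sketch; `LlamaCppBackend` is a hypothetical implementor, and
    /// it is assumed here that `CudaConfig` provides a `Default` implementation:
    ///
    /// ```ignore
    /// let backend = LlamaCppBackend::default().cuda_config(CudaConfig::default());
    /// ```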
    fn cuda_config(mut self, cuda_config: CudaConfig) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.cuda_config = Some(cuda_config);
        self
    }

    #[cfg(target_os = "macos")]
    /// Sets the Metal configuration for GPU inference on macOS.
    ///
    /// # Arguments
    ///
    /// * `metal_config` - The Metal configuration to use.
    ///
    /// # Notes
    ///
    /// This method is only available on macOS.
    fn metal_config(mut self, metal_config: MetalConfig) -> Self
    where
        Self: Sized,
    {
        self.config().device_config.metal_config = Some(metal_config);
        self
    }
}