
offline_intelligence/model_management/recommendation.rs

//! Model Recommendation System
//!
//! Provides hardware-aware model recommendations based on:
//! - Available RAM and VRAM
//! - CPU/GPU capabilities
//! - Model size requirements
//! - User preferences and use cases
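//!
//! # Example
//!
//! A minimal usage sketch; the `config` value and the registry that supplies
//! real `ModelInfo` entries live elsewhere in the crate, so an empty model
//! list keeps the sketch self-contained:
//!
//! ```ignore
//! let mut recommender = ModelRecommender::new();
//! recommender.set_preferences(UserPreferences::default());
//! let hardware = ModelRecommender::detect_hardware_profile(&config);
//! let top = recommender.get_recommendations(vec![], &hardware, 3);
//! assert!(top.is_empty());
//! ```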

use super::registry::ModelInfo;
use crate::config::Config;
use serde::{Deserialize, Serialize};
use tracing::{debug, info};

// System information is only needed at module scope on Apple Silicon, where
// unified memory is reported as GPU memory in `detect_gpu`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
use sysinfo::System;

/// User preferences for model recommendations
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UserPreferences {
    pub primary_use_case: UseCase,
    pub quality_preference: QualityPreference,
    pub speed_preference: SpeedPreference,
    pub cost_sensitivity: CostSensitivity,
    pub preferred_formats: Vec<String>,
}

/// Primary use case for the model
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum UseCase {
    ChatAssistant,
    CodeGeneration,
    CreativeWriting,
    ResearchAnalysis,
    Translation,
    GeneralPurpose,
}

/// Quality preference setting
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum QualityPreference {
    HighQuality,  // Prefer larger, more capable models
    Balanced,     // Balance quality and performance
    FastResponse, // Prioritize speed over quality
}

/// Speed preference setting
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum SpeedPreference {
    Fastest,        // Prioritize inference speed
    Balanced,       // Balance speed and quality
    HighestQuality, // Prioritize quality over speed
}

/// Cost sensitivity (relevant for cloud/API models)
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CostSensitivity {
    Budget,   // Prefer smaller, free models
    Moderate, // Balanced approach
    Premium,  // Willing to use larger/expensive models
}

impl Default for UserPreferences {
    fn default() -> Self {
        Self {
            primary_use_case: UseCase::GeneralPurpose,
            quality_preference: QualityPreference::Balanced,
            speed_preference: SpeedPreference::Balanced,
            cost_sensitivity: CostSensitivity::Moderate,
            preferred_formats: vec!["gguf".to_string()], // Default to GGUF for local inference
        }
    }
}

/// Hardware profile detected from system
#[derive(Debug, Clone)]
pub struct HardwareProfile {
    pub total_ram_gb: f32,
    pub available_ram_gb: f32,
    pub cpu_cores: u32,
    pub cpu_threads: u32,
    pub gpu_available: bool,
    pub gpu_vram_gb: Option<f32>,
    pub gpu_compute_capability: Option<f32>,
    pub system_architecture: String,
}

/// Model recommender service
pub struct ModelRecommender {
    user_preferences: UserPreferences,
}

impl ModelRecommender {
    pub fn new() -> Self {
        Self {
            user_preferences: UserPreferences::default(),
        }
    }

    /// Update user preferences
    pub fn set_preferences(&mut self, preferences: UserPreferences) {
        self.user_preferences = preferences;
        info!("Updated user preferences: {:?}", self.user_preferences);
    }

    /// Get current user preferences
    pub fn get_preferences(&self) -> &UserPreferences {
        &self.user_preferences
    }

    /// Detect hardware profile from system configuration
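    ///
    /// A sketch of the call (the `config` value is assumed to come from the
    /// crate's configuration loading):
    ///
    /// ```ignore
    /// let hw = ModelRecommender::detect_hardware_profile(&config);
    /// println!("{:.1} GB RAM, {} logical CPUs", hw.total_ram_gb, hw.cpu_cores);
    /// ```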
    pub fn detect_hardware_profile(config: &Config) -> HardwareProfile {
        let mut system = sysinfo::System::new_all();
        system.refresh_memory();
        system.refresh_cpu();

        let total_ram_gb = system.total_memory() as f32 / (1024.0 * 1024.0 * 1024.0);
        let available_ram_gb = system.available_memory() as f32 / (1024.0 * 1024.0 * 1024.0);
        // num_cpus::get() reports logical CPUs; the thread count comes from the
        // user's configuration rather than from hardware probing
        let cpu_cores = num_cpus::get() as u32;
        let cpu_threads = config.threads;

        // GPU detection
        let (gpu_available, gpu_vram_gb, gpu_compute_capability) = Self::detect_gpu();

        HardwareProfile {
            total_ram_gb,
            available_ram_gb,
            cpu_cores,
            cpu_threads,
            gpu_available,
            gpu_vram_gb,
            gpu_compute_capability,
            system_architecture: std::env::consts::ARCH.to_string(),
        }
    }

    /// Detect GPU capabilities (platform-specific)
    fn detect_gpu() -> (bool, Option<f32>, Option<f32>) {
        // Windows and Linux: use NVML for NVIDIA GPU detection (only when the
        // `nvidia` feature is enabled)
        #[cfg(all(feature = "nvidia", any(target_os = "windows", target_os = "linux")))]
        {
            match nvml_wrapper::Nvml::init() {
                Ok(nvml) => {
                    match nvml.device_count() {
                        Ok(count) if count > 0 => {
                            match nvml.device_by_index(0) {
                                Ok(device) => {
                                    let vram_bytes =
                                        device.memory_info().map(|mem| mem.total).unwrap_or(0);
                                    let vram_gb = if vram_bytes > 0 {
                                        Some(vram_bytes as f32 / (1024.0 * 1024.0 * 1024.0))
                                    } else {
                                        None
                                    };

                                    // Simplified compute capability detection:
                                    // assume a modern card rather than querying
                                    // the device for its real major/minor version
                                    let compute_capability = Some(7.5);

                                    (true, vram_gb, compute_capability)
                                }
                                Err(_) => (false, None, None),
                            }
                        }
                        _ => (false, None, None),
                    }
                }
                Err(_) => (false, None, None),
            }
        }

        // Windows and Linux without NVML: fallback - no GPU metrics available at this level
        #[cfg(all(not(feature = "nvidia"), any(target_os = "windows", target_os = "linux")))]
        {
            // GPU layer count is already detected in config.rs via nvidia-smi
            (false, None, None)
        }

        // macOS Apple Silicon: Metal GPU with unified memory
        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            let mut sys = System::new_all();
            sys.refresh_memory();
            // Apple Silicon uses unified memory - report the total as "VRAM" since it's shared
            let unified_mem_gb = sys.total_memory() as f32 / (1024.0 * 1024.0 * 1024.0);
            // Metal compute capability is not directly comparable to CUDA, so none is reported
            info!(
                "Apple Silicon detected with Metal GPU, unified memory: {:.1} GB",
                unified_mem_gb
            );
            (true, Some(unified_mem_gb), None)
        }

        // macOS Intel: no efficient GPU for LLM inference
        #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
        {
            info!("Intel Mac detected, no GPU acceleration available");
            (false, None, None)
        }
    }

    /// Score a model's compatibility with current hardware
    pub fn score_model_compatibility(&self, model: &ModelInfo, hardware: &HardwareProfile) -> f32 {
        let mut score = 1.0f32;

        // RAM requirements check: rough estimate of file size plus a 50% runtime buffer
        let model_ram_gb = (model.size_bytes as f32) / (1024.0 * 1024.0 * 1024.0) * 1.5;

        if model_ram_gb > hardware.available_ram_gb {
            score *= 0.3; // Heavy penalty for insufficient RAM
        } else if model_ram_gb > hardware.total_ram_gb * 0.8 {
            score *= 0.7; // Moderate penalty for tight RAM
        }

        // GPU requirements check
        let requires_gpu = model.tags.contains(&"gpu".to_string())
            || model.format.eq_ignore_ascii_case("tensorrt");

        if requires_gpu && !hardware.gpu_available {
            score *= 0.2; // Heavy penalty for GPU requirement without GPU
        } else if requires_gpu && hardware.gpu_available {
            if let Some(vram_gb) = hardware.gpu_vram_gb {
                let required_vram = match model.size_bytes {
                    s if s < 5 * 1024 * 1024 * 1024 => 6.0, // models under 5 GB need ~6 GB VRAM
                    s if s < 10 * 1024 * 1024 * 1024 => 12.0,
                    _ => 24.0,
                };

                if vram_gb < required_vram * 0.8 {
                    score *= 0.5; // Penalty for tight VRAM
                }
            }
        }

        // Format preference bonus
        if self
            .user_preferences
            .preferred_formats
            .iter()
            .any(|f| model.format.eq_ignore_ascii_case(f))
        {
            score *= 1.2; // Bonus for preferred format
        }

        // Use case alignment
        score *= self.score_use_case_alignment(model);

        // Quality/speed preference adjustment
        score *= self.score_quality_speed_preference(model);

        // Bonuses can push the raw score above 1.0; clamping keeps scores in
        // [0, 1], so bonuses effectively offset earlier penalties
        score.clamp(0.0, 1.0)
    }

    /// Score how well a model aligns with the user's use case
    fn score_use_case_alignment(&self, model: &ModelInfo) -> f32 {
        let use_case_tags: Vec<&str> = model.tags.iter().map(|s| s.as_str()).collect();

        match self.user_preferences.primary_use_case {
            UseCase::ChatAssistant => {
                if use_case_tags.contains(&"chat") || use_case_tags.contains(&"instruction") {
                    1.3
                } else if use_case_tags.contains(&"general") {
                    1.1
                } else {
                    1.0
                }
            }
            UseCase::CodeGeneration => {
                if use_case_tags.contains(&"code") || use_case_tags.contains(&"programming") {
                    1.4
                } else {
                    1.0
                }
            }
            UseCase::CreativeWriting => {
                if use_case_tags.contains(&"creative") || use_case_tags.contains(&"story") {
                    1.3
                } else if use_case_tags.contains(&"text-generation") {
                    1.1
                } else {
                    1.0
                }
            }
            UseCase::ResearchAnalysis => {
                if use_case_tags.contains(&"research") || use_case_tags.contains(&"analysis") {
                    1.3
                } else {
                    1.0
                }
            }
            UseCase::Translation => {
                if use_case_tags.contains(&"translation") || use_case_tags.contains(&"multilingual")
                {
                    1.4
                } else {
                    1.0
                }
            }
            UseCase::GeneralPurpose => 1.0,
        }
    }

    /// Adjust score based on quality/speed preferences
    fn score_quality_speed_preference(&self, model: &ModelInfo) -> f32 {
        let model_size_category = if model.size_bytes < 3 * 1024 * 1024 * 1024 {
            "small" // < 3GB
        } else if model.size_bytes < 10 * 1024 * 1024 * 1024 {
            "medium" // 3-10GB
        } else {
            "large" // > 10GB
        };

        match (
            &self.user_preferences.quality_preference,
            &self.user_preferences.speed_preference,
        ) {
            (QualityPreference::HighQuality, SpeedPreference::HighestQuality) => {
                match model_size_category {
                    "large" => 1.3,
                    "medium" => 1.1,
                    "small" => 0.8,
                    _ => 1.0,
                }
            }
            (QualityPreference::FastResponse, SpeedPreference::Fastest) => {
                match model_size_category {
                    "small" => 1.3,
                    "medium" => 1.0,
                    "large" => 0.6,
                    _ => 1.0,
                }
            }
            _ => 1.0, // Balanced preferences
        }
    }

    /// Get top recommended models for current hardware and preferences
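    ///
    /// Returns `(model_id, score)` pairs, highest score first, truncated to
    /// `max_results`. A sketch of the call shape (`registry_models` stands in
    /// for whatever the caller collects from the model registry):
    ///
    /// ```ignore
    /// let ranked = recommender.get_recommendations(registry_models, &hardware, 5);
    /// for (id, score) in &ranked {
    ///     println!("{id}: {score:.2}");
    /// }
    /// ```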
    pub fn get_recommendations(
        &self,
        models: Vec<&ModelInfo>,
        hardware: &HardwareProfile,
        max_results: usize,
    ) -> Vec<(String, f32)> {
        let mut scored_models: Vec<(String, f32)> = models
            .iter()
            .map(|model| {
                let score = self.score_model_compatibility(model, hardware);
                (model.id.clone(), score)
            })
            .collect();

        // Sort by compatibility score (descending)
        scored_models.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        // Take top results
        scored_models.truncate(max_results);

        debug!("Generated {} model recommendations", scored_models.len());
        scored_models
    }

    /// Get hardware recommendations message
    pub fn get_hardware_recommendation_message(&self, hardware: &HardwareProfile) -> String {
        let mut recommendations = Vec::new();

        if hardware.total_ram_gb < 8.0 {
            recommendations
                .push("Consider upgrading RAM to 8GB+ for better model performance".to_string());
        }

        if !hardware.gpu_available && hardware.total_ram_gb >= 16.0 {
            recommendations.push(
                "A GPU would significantly accelerate inference for larger models".to_string(),
            );
        }

        if hardware.gpu_available {
            if let Some(vram) = hardware.gpu_vram_gb {
                if vram < 8.0 {
                    recommendations.push(format!(
                        "With {:.1}GB VRAM, you can run small to medium-sized models efficiently",
                        vram
                    ));
                } else if vram >= 16.0 {
                    recommendations.push(format!(
                        "With {:.1}GB VRAM, you can run large models with full GPU acceleration",
                        vram
                    ));
                }
            }
        }

        if recommendations.is_empty() {
            "Your system configuration is well-suited for running various model sizes".to_string()
        } else {
            recommendations.join("\n")
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::Config;

    #[test]
    fn test_hardware_detection() {
        let config = Config {
            model_path: "".to_string(),
            llama_bin: "".to_string(),
            llama_host: "127.0.0.1".to_string(),
            llama_port: 8001,
            ctx_size: 8192,
            batch_size: 128,
            threads: 6,
            gpu_layers: 20,
            health_timeout_seconds: 600,
            hot_swap_grace_seconds: 25,
            max_concurrent_streams: 2,
            prometheus_port: 9000,
            api_host: "127.0.0.1".to_string(),
            api_port: 9999,
            requests_per_second: 24,
            generate_timeout_seconds: 300,
            stream_timeout_seconds: 600,
            health_check_timeout_seconds: 900,
            queue_size: 1000,
            queue_timeout_seconds: 300,
            backend_url: "http://127.0.0.1:8001".to_string(),
            openrouter_api_key: "".to_string(),
        };

        let hardware = ModelRecommender::detect_hardware_profile(&config);
        assert!(hardware.total_ram_gb > 0.0);
        assert!(hardware.cpu_cores > 0);
    }
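
    // Additional sketches exercising the pure parts of the recommender. They
    // avoid constructing `ModelInfo` (defined in the registry module), so they
    // rely only on types declared in this file.
    #[test]
    fn test_default_preferences() {
        let prefs = UserPreferences::default();
        assert_eq!(prefs.primary_use_case, UseCase::GeneralPurpose);
        assert_eq!(prefs.preferred_formats, vec!["gguf".to_string()]);
    }

    #[test]
    fn test_empty_recommendations_and_hardware_message() {
        let recommender = ModelRecommender::new();
        // Hand-built profile: a CPU-only machine with ample RAM.
        let hardware = HardwareProfile {
            total_ram_gb: 32.0,
            available_ram_gb: 24.0,
            cpu_cores: 8,
            cpu_threads: 8,
            gpu_available: false,
            gpu_vram_gb: None,
            gpu_compute_capability: None,
            system_architecture: std::env::consts::ARCH.to_string(),
        };

        // No candidate models yields no recommendations.
        assert!(recommender
            .get_recommendations(vec![], &hardware, 5)
            .is_empty());

        // 16GB+ RAM without a GPU should trigger the GPU suggestion.
        let message = recommender.get_hardware_recommendation_message(&hardware);
        assert!(message.contains("GPU"));
    }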
}