// null_e/cleaners/ml.rs
//! ML/AI cleanup module
//!
//! Handles cleanup of machine learning development files:
//! - Hugging Face models and datasets
//! - Ollama models
//! - PyTorch model cache
//! - Keras models
//! - TensorFlow cache
//! - Jupyter checkpoints

use super::{calculate_dir_size, get_mtime, CleanableItem, SafetyLevel};
use crate::error::Result;
use std::path::{Path, PathBuf};
14
/// ML/AI cleaner
///
/// Scans well-known cache/model directories of ML tooling (Hugging Face,
/// Ollama, PyTorch, Keras, TensorFlow, Jupyter, LM Studio, GPT4All) under
/// the user's home directory and reports cleanable items.
pub struct MlCleaner {
    /// User's home directory; every scan path is resolved relative to it.
    home: PathBuf,
}
19
20impl MlCleaner {
21    /// Create a new ML cleaner
22    pub fn new() -> Option<Self> {
23        let home = dirs::home_dir()?;
24        Some(Self { home })
25    }
26
27    /// Detect all ML cleanable items
28    pub fn detect(&self) -> Result<Vec<CleanableItem>> {
29        let mut items = Vec::new();
30
31        // Hugging Face
32        items.extend(self.detect_huggingface()?);
33
34        // Ollama
35        items.extend(self.detect_ollama()?);
36
37        // PyTorch
38        items.extend(self.detect_pytorch()?);
39
40        // Keras
41        items.extend(self.detect_keras()?);
42
43        // TensorFlow
44        items.extend(self.detect_tensorflow()?);
45
46        // Jupyter
47        items.extend(self.detect_jupyter()?);
48
49        // LM Studio
50        items.extend(self.detect_lmstudio()?);
51
52        // GPT4All
53        items.extend(self.detect_gpt4all()?);
54
55        Ok(items)
56    }
57
58    /// Detect Hugging Face cache
59    fn detect_huggingface(&self) -> Result<Vec<CleanableItem>> {
60        let hf_paths = [
61            (".cache/huggingface/hub", "HF Models"),
62            (".cache/huggingface/datasets", "HF Datasets"),
63            (".cache/huggingface/transformers", "HF Transformers"),
64        ];
65
66        let mut items = Vec::new();
67
68        for (rel_path, name) in hf_paths {
69            let path = self.home.join(rel_path);
70            if !path.exists() {
71                continue;
72            }
73
74            // List individual models/datasets
75            if let Ok(entries) = std::fs::read_dir(&path) {
76                for entry in entries.filter_map(|e| e.ok()) {
77                    let entry_path = entry.path();
78                    if !entry_path.is_dir() {
79                        continue;
80                    }
81
82                    let entry_name = entry_path.file_name()
83                        .map(|n| n.to_string_lossy().to_string())
84                        .unwrap_or_default();
85
86                    // Skip hidden files and special directories
87                    if entry_name.starts_with('.') || entry_name == "version.txt" {
88                        continue;
89                    }
90
91                    let (size, file_count) = calculate_dir_size(&entry_path)?;
92                    if size < 10_000_000 {
93                        // Skip if less than 10MB
94                        continue;
95                    }
96
97                    // Parse model name from directory structure
98                    let display_name = entry_name
99                        .replace("models--", "")
100                        .replace("datasets--", "")
101                        .replace("--", "/");
102
103                    items.push(CleanableItem {
104                        name: format!("{}: {}", name, display_name),
105                        category: "ML/AI".to_string(),
106                        subcategory: "Hugging Face".to_string(),
107                        icon: "🤗",
108                        path: entry_path,
109                        size,
110                        file_count: Some(file_count),
111                        last_modified: get_mtime(&entry.path()),
112                        description: "Downloaded ML model or dataset. Can be re-downloaded.",
113                        safe_to_delete: SafetyLevel::SafeWithCost,
114                        clean_command: None,
115                    });
116                }
117            }
118        }
119
120        Ok(items)
121    }
122
123    /// Detect Ollama models
124    fn detect_ollama(&self) -> Result<Vec<CleanableItem>> {
125        let ollama_path = self.home.join(".ollama/models");
126
127        if !ollama_path.exists() {
128            return Ok(vec![]);
129        }
130
131        let mut items = Vec::new();
132
133        // Ollama stores models in blobs and manifests
134        let blobs_path = ollama_path.join("blobs");
135        let manifests_path = ollama_path.join("manifests");
136
137        // Get model names from manifests
138        if manifests_path.exists() {
139            self.scan_ollama_manifests(&manifests_path, &blobs_path, &mut items)?;
140        }
141
142        // If no manifests, just report total blobs size
143        if items.is_empty() && blobs_path.exists() {
144            let (size, file_count) = calculate_dir_size(&blobs_path)?;
145            if size > 0 {
146                items.push(CleanableItem {
147                    name: "Ollama Models (all)".to_string(),
148                    category: "ML/AI".to_string(),
149                    subcategory: "Ollama".to_string(),
150                    icon: "🦙",
151                    path: ollama_path,
152                    size,
153                    file_count: Some(file_count),
154                    last_modified: None,
155                    description: "Local LLM models. Can be re-downloaded with 'ollama pull'.",
156                    safe_to_delete: SafetyLevel::SafeWithCost,
157                    clean_command: Some("ollama rm <model>".to_string()),
158                });
159            }
160        }
161
162        Ok(items)
163    }
164
165    /// Scan Ollama manifests for model info
166    fn scan_ollama_manifests(&self, manifests_path: &PathBuf, _blobs_path: &PathBuf, items: &mut Vec<CleanableItem>) -> Result<()> {
167        // manifests/registry.ollama.ai/library/<model>/<tag>
168        let registry_path = manifests_path.join("registry.ollama.ai/library");
169        if !registry_path.exists() {
170            return Ok(());
171        }
172
173        if let Ok(models) = std::fs::read_dir(&registry_path) {
174            for model in models.filter_map(|e| e.ok()) {
175                let model_path = model.path();
176                if !model_path.is_dir() {
177                    continue;
178                }
179
180                let model_name = model_path.file_name()
181                    .map(|n| n.to_string_lossy().to_string())
182                    .unwrap_or_default();
183
184                // Get total size for this model
185                let (size, file_count) = calculate_dir_size(&model_path)?;
186
187                // Estimate actual model size (manifests are small, blobs are big)
188                // This is a rough estimate - actual size requires parsing manifests
189                let estimated_size = size * 1000; // Manifests point to much larger blobs
190
191                if estimated_size > 100_000_000 {
192                    items.push(CleanableItem {
193                        name: format!("Ollama: {}", model_name),
194                        category: "ML/AI".to_string(),
195                        subcategory: "Ollama".to_string(),
196                        icon: "🦙",
197                        path: model_path,
198                        size: estimated_size,
199                        file_count: Some(file_count),
200                        last_modified: get_mtime(&model.path()),
201                        description: "Local LLM model. Use 'ollama rm' to remove.",
202                        safe_to_delete: SafetyLevel::SafeWithCost,
203                        clean_command: Some(format!("ollama rm {}", model_name)),
204                    });
205                }
206            }
207        }
208
209        Ok(())
210    }
211
212    /// Detect PyTorch cache
213    fn detect_pytorch(&self) -> Result<Vec<CleanableItem>> {
214        let torch_paths = [
215            (".cache/torch", "PyTorch Cache"),
216            (".cache/torch/hub", "PyTorch Hub Models"),
217        ];
218
219        let mut items = Vec::new();
220
221        for (rel_path, name) in torch_paths {
222            let path = self.home.join(rel_path);
223            if !path.exists() {
224                continue;
225            }
226
227            let (size, file_count) = calculate_dir_size(&path)?;
228            if size < 10_000_000 {
229                continue;
230            }
231
232            items.push(CleanableItem {
233                name: name.to_string(),
234                category: "ML/AI".to_string(),
235                subcategory: "PyTorch".to_string(),
236                icon: "🔥",
237                path,
238                size,
239                file_count: Some(file_count),
240                last_modified: None,
241                description: "PyTorch model cache. Can be re-downloaded.",
242                safe_to_delete: SafetyLevel::SafeWithCost,
243                clean_command: None,
244            });
245        }
246
247        Ok(items)
248    }
249
250    /// Detect Keras cache
251    fn detect_keras(&self) -> Result<Vec<CleanableItem>> {
252        let keras_path = self.home.join(".keras");
253
254        if !keras_path.exists() {
255            return Ok(vec![]);
256        }
257
258        let models_path = keras_path.join("models");
259        if models_path.exists() {
260            let (size, file_count) = calculate_dir_size(&models_path)?;
261            if size > 10_000_000 {
262                return Ok(vec![CleanableItem {
263                    name: "Keras Models".to_string(),
264                    category: "ML/AI".to_string(),
265                    subcategory: "Keras".to_string(),
266                    icon: "🧠",
267                    path: models_path,
268                    size,
269                    file_count: Some(file_count),
270                    last_modified: None,
271                    description: "Keras pre-trained models. Can be re-downloaded.",
272                    safe_to_delete: SafetyLevel::SafeWithCost,
273                    clean_command: None,
274                }]);
275            }
276        }
277
278        Ok(vec![])
279    }
280
281    /// Detect TensorFlow cache
282    fn detect_tensorflow(&self) -> Result<Vec<CleanableItem>> {
283        let tf_paths = [
284            (".tensorflow", "TensorFlow Cache"),
285            (".cache/tensorflow", "TensorFlow Hub Cache"),
286        ];
287
288        let mut items = Vec::new();
289
290        for (rel_path, name) in tf_paths {
291            let path = self.home.join(rel_path);
292            if !path.exists() {
293                continue;
294            }
295
296            let (size, file_count) = calculate_dir_size(&path)?;
297            if size < 10_000_000 {
298                continue;
299            }
300
301            items.push(CleanableItem {
302                name: name.to_string(),
303                category: "ML/AI".to_string(),
304                subcategory: "TensorFlow".to_string(),
305                icon: "📊",
306                path,
307                size,
308                file_count: Some(file_count),
309                last_modified: None,
310                description: "TensorFlow model cache. Can be re-downloaded.",
311                safe_to_delete: SafetyLevel::SafeWithCost,
312                clean_command: None,
313            });
314        }
315
316        Ok(items)
317    }
318
319    /// Detect Jupyter cache and checkpoints
320    fn detect_jupyter(&self) -> Result<Vec<CleanableItem>> {
321        let jupyter_paths = [
322            (".cache/jupyter", "Jupyter Cache"),
323            (".jupyter", "Jupyter Config & Data"),
324            (".local/share/jupyter", "Jupyter Data"),
325        ];
326
327        let mut items = Vec::new();
328
329        for (rel_path, name) in jupyter_paths {
330            let path = self.home.join(rel_path);
331            if !path.exists() {
332                continue;
333            }
334
335            let (size, file_count) = calculate_dir_size(&path)?;
336            if size < 10_000_000 {
337                continue;
338            }
339
340            items.push(CleanableItem {
341                name: name.to_string(),
342                category: "ML/AI".to_string(),
343                subcategory: "Jupyter".to_string(),
344                icon: "📓",
345                path,
346                size,
347                file_count: Some(file_count),
348                last_modified: None,
349                description: "Jupyter notebook cache and runtime data.",
350                safe_to_delete: SafetyLevel::Safe,
351                clean_command: None,
352            });
353        }
354
355        Ok(items)
356    }
357
358    /// Detect LM Studio models
359    fn detect_lmstudio(&self) -> Result<Vec<CleanableItem>> {
360        let lmstudio_path = self.home.join(".lmstudio/models");
361
362        if !lmstudio_path.exists() {
363            // Try alternative location
364            let alt_path = self.home.join(".cache/lm-studio");
365            if !alt_path.exists() {
366                return Ok(vec![]);
367            }
368        }
369
370        let (size, file_count) = calculate_dir_size(&lmstudio_path)?;
371        if size == 0 {
372            return Ok(vec![]);
373        }
374
375        Ok(vec![CleanableItem {
376            name: "LM Studio Models".to_string(),
377            category: "ML/AI".to_string(),
378            subcategory: "LM Studio".to_string(),
379            icon: "🎯",
380            path: lmstudio_path,
381            size,
382            file_count: Some(file_count),
383            last_modified: None,
384            description: "LM Studio downloaded models. Can be re-downloaded.",
385            safe_to_delete: SafetyLevel::SafeWithCost,
386            clean_command: None,
387        }])
388    }
389
390    /// Detect GPT4All models
391    fn detect_gpt4all(&self) -> Result<Vec<CleanableItem>> {
392        let gpt4all_paths = [
393            ".cache/gpt4all",
394            "Library/Application Support/nomic.ai/GPT4All",
395        ];
396
397        for rel_path in gpt4all_paths {
398            let path = self.home.join(rel_path);
399            if !path.exists() {
400                continue;
401            }
402
403            let (size, file_count) = calculate_dir_size(&path)?;
404            if size > 100_000_000 {
405                return Ok(vec![CleanableItem {
406                    name: "GPT4All Models".to_string(),
407                    category: "ML/AI".to_string(),
408                    subcategory: "GPT4All".to_string(),
409                    icon: "🤖",
410                    path,
411                    size,
412                    file_count: Some(file_count),
413                    last_modified: None,
414                    description: "GPT4All downloaded models. Can be re-downloaded.",
415                    safe_to_delete: SafetyLevel::SafeWithCost,
416                    clean_command: None,
417                }]);
418            }
419        }
420
421        Ok(vec![])
422    }
423}
424
425impl Default for MlCleaner {
426    fn default() -> Self {
427        Self::new().expect("MlCleaner requires home directory")
428    }
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_ml_cleaner_creation() {
        // Construction only fails when the home directory is unavailable.
        assert!(MlCleaner::new().is_some());
    }

    #[test]
    fn test_ml_detection() {
        // Smoke test: detection must complete without error on this machine.
        if let Some(cleaner) = MlCleaner::new() {
            let detected = cleaner.detect().unwrap();
            println!("Found {} ML items", detected.len());
            for entry in &detected {
                println!("  {} {} ({} bytes)", entry.icon, entry.name, entry.size);
            }
        }
    }
}