Skip to main content

hf_fetch_model/
cache.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2
3//! `HuggingFace` cache directory resolution, model family scanning, disk usage,
4//! and integrity verification.
5//!
6//! [`hf_cache_dir()`] locates the local HF cache. [`list_cached_families()`]
7//! scans downloaded models and groups them by `model_type`.
8//! [`cache_summary()`] provides per-repo size totals,
9//! [`cache_repo_usage()`] returns per-file disk usage for a single repo, and
10//! [`verify_cache()`] re-checks `SHA256` digests of cached files against
11//! `HuggingFace` LFS metadata.
12
13use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use crate::error::FetchError;
17
/// Reconstructs a repo ID from a `models--org--name` directory name.
///
/// The HF cache names repo directories `models--<org>--<name>`; this
/// reverses that mapping by replacing the first `--` after the prefix
/// with `/`. Later `--` runs belong to the repo name and are preserved.
/// A directory with no org separator is returned as-is.
///
/// Returns `None` if the directory name does not start with `models--`.
fn repo_id_from_folder_name(dir_name: &str) -> Option<String> {
    let repo_part = dir_name.strip_prefix("models--")?;

    // `split_once` splits on the FIRST "--" only, which is exactly the
    // org/name boundary; it also avoids the manual find/split_at/get(2..)
    // dance (and the silent `unwrap_or_default` fallback) of the original.
    let repo_id = match repo_part.split_once("--") {
        Some((org, name)) => format!("{org}/{name}"),
        None => repo_part.to_string(),
    };

    Some(repo_id)
}
36
37/// Returns the `HuggingFace` Hub cache directory.
38///
39/// Resolution order:
40/// 1. `HF_HOME` environment variable + `/hub`
41/// 2. `~/.cache/huggingface/hub/` (via [`dirs::home_dir()`])
42///
43/// # Errors
44///
45/// Returns [`FetchError::Io`] if the home directory cannot be determined.
46pub fn hf_cache_dir() -> Result<PathBuf, FetchError> {
47    if let Ok(home) = std::env::var("HF_HOME") {
48        let mut path = PathBuf::from(home);
49        path.push("hub");
50        return Ok(path);
51    }
52
53    let home = dirs::home_dir().ok_or_else(|| FetchError::Io {
54        path: PathBuf::from("~"),
55        source: std::io::Error::new(std::io::ErrorKind::NotFound, "home directory not found"),
56    })?;
57
58    let mut path = home;
59    path.push(".cache");
60    path.push("huggingface");
61    path.push("hub");
62    Ok(path)
63}
64
65/// Scans the local HF cache for downloaded models and groups them by `model_type`.
66///
67/// Looks for `config.json` files inside model snapshot directories:
68/// `<cache>/models--<org>--<name>/snapshots/*/config.json`
69///
70/// Returns a map from `model_type` (e.g., `"llama"`) to a sorted list of
71/// repository identifiers (e.g., `["meta-llama/Llama-3.2-1B"]`).
72///
73/// Models without a `model_type` field in their `config.json` are skipped.
74///
75/// # Errors
76///
77/// Returns [`FetchError::Io`] if the cache directory cannot be read.
78pub fn list_cached_families() -> Result<BTreeMap<String, Vec<String>>, FetchError> {
79    let cache_dir = hf_cache_dir()?;
80
81    if !cache_dir.exists() {
82        return Ok(BTreeMap::new());
83    }
84
85    let entries = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
86        path: cache_dir.clone(),
87        source: e,
88    })?;
89
90    let mut families: BTreeMap<String, Vec<String>> = BTreeMap::new();
91
92    for entry in entries {
93        let Ok(entry) = entry else { continue };
94
95        let dir_name = entry.file_name();
96        // BORROW: explicit .to_string_lossy() for OsString → str conversion
97        let dir_str = dir_name.to_string_lossy();
98
99        let Some(repo_id) = repo_id_from_folder_name(&dir_str) else {
100            continue;
101        };
102
103        // Find the newest snapshot's config.json
104        let snapshots_dir = crate::cache_layout::snapshots_dir(&entry.path());
105        if !snapshots_dir.exists() {
106            continue;
107        }
108
109        if let Some(model_type) = find_model_type_in_snapshots(&snapshots_dir) {
110            families.entry(model_type).or_default().push(repo_id);
111        }
112    }
113
114    // Sort repo lists within each family for stable output
115    for repos in families.values_mut() {
116        repos.sort();
117    }
118
119    Ok(families)
120}
121
122/// Searches snapshot directories for a `config.json` containing `model_type`.
123///
124/// Returns the first `model_type` value found, or `None`.
125fn find_model_type_in_snapshots(snapshots_dir: &std::path::Path) -> Option<String> {
126    let snapshots = std::fs::read_dir(snapshots_dir).ok()?;
127
128    for snap_entry in snapshots {
129        let Ok(snap_entry) = snap_entry else { continue };
130        let config_path = snap_entry.path().join("config.json");
131
132        if !config_path.exists() {
133            continue;
134        }
135
136        if let Some(model_type) = extract_model_type(&config_path) {
137            return Some(model_type);
138        }
139    }
140
141    None
142}
143
144/// Reads a `config.json` file and extracts the `model_type` field.
145fn extract_model_type(config_path: &std::path::Path) -> Option<String> {
146    let contents = std::fs::read_to_string(config_path).ok()?;
147    // BORROW: explicit .as_str() instead of Deref coercion
148    let value: serde_json::Value = serde_json::from_str(contents.as_str()).ok()?;
149    // BORROW: explicit .as_str() on serde_json Value
150    value.get("model_type")?.as_str().map(String::from)
151}
152
/// Status of a single file in the cache.
///
/// Produced by [`repo_status`] when cross-referencing the remote file list
/// against the local snapshot directory.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum FileStatus {
    /// File is fully downloaded (local size matches expected size, or no expected size known).
    Complete {
        /// Local file size in bytes.
        local_size: u64,
    },
    /// File exists but is smaller than expected (interrupted download),
    /// or a `.chunked.part` temp file was found in the blobs directory
    /// (repo-level heuristic — may not correspond to this specific file).
    Partial {
        /// Local file size in bytes.
        local_size: u64,
        /// Expected file size in bytes.
        expected_size: u64,
    },
    /// File is not present in the cache.
    Missing {
        /// Expected file size in bytes (0 if unknown).
        expected_size: u64,
    },
}
177
/// Cache status report for a repository.
///
/// Built by [`repo_status`] from the remote file listing plus the local
/// snapshot contents.
#[derive(Debug, Clone)]
pub struct RepoStatus {
    /// The repository identifier.
    pub repo_id: String,
    /// The resolved commit hash (if available).
    pub commit_hash: Option<String>,
    /// The cache directory for this repo.
    pub cache_path: PathBuf,
    /// Per-file status, sorted by filename.
    pub files: Vec<(String, FileStatus)>,
}
190
191impl RepoStatus {
192    /// Number of fully downloaded files.
193    #[must_use]
194    pub fn complete_count(&self) -> usize {
195        self.files
196            .iter()
197            .filter(|(_, s)| matches!(s, FileStatus::Complete { .. }))
198            .count()
199    }
200
201    /// Number of partially downloaded files.
202    #[must_use]
203    pub fn partial_count(&self) -> usize {
204        self.files
205            .iter()
206            .filter(|(_, s)| matches!(s, FileStatus::Partial { .. }))
207            .count()
208    }
209
210    /// Number of missing files.
211    #[must_use]
212    pub fn missing_count(&self) -> usize {
213        self.files
214            .iter()
215            .filter(|(_, s)| matches!(s, FileStatus::Missing { .. }))
216            .count()
217    }
218}
219
220/// Inspects the local cache for a repository and compares against the remote file list.
221///
222/// # Arguments
223///
224/// * `repo_id` — The repository identifier (e.g., `"RWKV/RWKV7-Goose-World3-1.5B-HF"`).
225/// * `token` — Optional authentication token.
226/// * `revision` — Optional revision (defaults to `"main"`).
227///
228/// # Notes
229///
230/// Partial download detection is a repo-level heuristic: if any
231/// `.chunked.part` file exists in the repo's `blobs/` directory, all
232/// missing files are reported as [`FileStatus::Partial`] with the partial
233/// file's size. This may overcount partials when multiple files are
234/// missing but only one has an incomplete blob. Exact blob-to-file
235/// mapping would require LFS metadata.
236///
237/// # Errors
238///
239/// Returns [`FetchError::Http`] if the API request fails.
240/// Returns [`FetchError::Io`] if the cache directory cannot be read.
241pub async fn repo_status(
242    repo_id: &str,
243    token: Option<&str>,
244    revision: Option<&str>,
245) -> Result<RepoStatus, FetchError> {
246    let revision = revision.unwrap_or("main");
247    let cache_dir = hf_cache_dir()?;
248    let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
249
250    // Read commit hash from refs file if available.
251    let commit_hash = read_ref(&repo_dir, revision);
252
253    // Fetch remote file list with sizes.
254    let client = crate::chunked::build_client(token)?;
255    let remote_files =
256        crate::repo::list_repo_files_with_metadata(repo_id, token, Some(revision), &client).await?;
257
258    // Determine snapshot directory.
259    // BORROW: explicit .as_deref() for Option<String> → Option<&str>
260    let snapshot_dir = commit_hash
261        .as_deref()
262        .map(|hash| crate::cache_layout::snapshot_dir(&repo_dir, hash));
263
264    // Pre-check for .chunked.part files in blobs directory (avoids re-scanning
265    // the blobs directory for every missing file in the loop below).
266    let blobs_dir = crate::cache_layout::blobs_dir(&repo_dir);
267    let has_any_partial = has_partial_blob(&blobs_dir);
268
269    // Cross-reference remote files against local state.
270    let mut files: Vec<(String, FileStatus)> = Vec::with_capacity(remote_files.len());
271
272    for remote in &remote_files {
273        let expected_size = remote.size.unwrap_or(0);
274
275        let local_path = snapshot_dir
276            .as_ref()
277            // BORROW: explicit .as_str() for path construction
278            .map(|dir| dir.join(remote.filename.as_str()));
279
280        let status = if let Some(ref path) = local_path {
281            if path.exists() {
282                let local_size = std::fs::metadata(path).map_or(0, |m| m.len());
283
284                if expected_size > 0 && local_size < expected_size {
285                    FileStatus::Partial {
286                        local_size,
287                        expected_size,
288                    }
289                } else {
290                    FileStatus::Complete { local_size }
291                }
292            } else if has_any_partial {
293                // Blobs directory has .chunked.part temp files
294                let part_size = find_partial_blob_size(&blobs_dir);
295                FileStatus::Partial {
296                    local_size: part_size,
297                    expected_size,
298                }
299            } else {
300                FileStatus::Missing { expected_size }
301            }
302        } else {
303            FileStatus::Missing { expected_size }
304        };
305
306        // BORROW: explicit .clone() for owned String
307        files.push((remote.filename.clone(), status));
308    }
309
310    files.sort_by(|(a, _), (b, _)| a.cmp(b));
311
312    // BORROW: explicit .to_owned() for &str → owned String field
313    Ok(RepoStatus {
314        repo_id: repo_id.to_owned(),
315        commit_hash,
316        cache_path: repo_dir,
317        files,
318    })
319}
320
/// Summary of a single cached model (local-only, no API calls).
///
/// Produced by [`cache_summary`].
#[derive(Debug, Clone)]
pub struct CachedModelSummary {
    /// The repository identifier (e.g., `"RWKV/RWKV7-Goose-World3-1.5B-HF"`).
    pub repo_id: String,
    /// Number of files in the snapshot directory.
    pub file_count: usize,
    /// Total size on disk in bytes.
    pub total_size: u64,
    /// Whether there are incomplete `.chunked.part` temp files.
    pub has_partial: bool,
    /// Most recent modification time among files in the snapshot directory.
    ///
    /// `None` if no files were found or all metadata reads failed.
    pub last_modified: Option<std::time::SystemTime>,
}
337
338/// Scans the entire HF cache and returns a summary for each cached model.
339///
340/// This is a local-only operation (no API calls). It lists all `models--*`
341/// directories and counts files + sizes in each snapshot.
342///
343/// # Errors
344///
345/// Returns [`FetchError::Io`] if the cache directory cannot be read.
346pub fn cache_summary() -> Result<Vec<CachedModelSummary>, FetchError> {
347    let cache_dir = hf_cache_dir()?;
348
349    if !cache_dir.exists() {
350        return Ok(Vec::new());
351    }
352
353    let entries = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
354        path: cache_dir.clone(),
355        source: e,
356    })?;
357
358    let mut summaries: Vec<CachedModelSummary> = Vec::new();
359
360    for entry in entries {
361        let Ok(entry) = entry else { continue };
362        let dir_name = entry.file_name();
363        // BORROW: explicit .to_string_lossy() for OsString → str conversion
364        let dir_str = dir_name.to_string_lossy();
365
366        let Some(repo_id) = repo_id_from_folder_name(&dir_str) else {
367            continue;
368        };
369
370        let repo_dir = entry.path();
371
372        // Count files and total size in snapshots.
373        let (file_count, total_size, last_modified) = count_snapshot_files(&repo_dir);
374
375        // Check for partial downloads.
376        let has_partial = find_partial_blob_size(&crate::cache_layout::blobs_dir(&repo_dir)) > 0;
377
378        summaries.push(CachedModelSummary {
379            repo_id,
380            file_count,
381            total_size,
382            has_partial,
383            last_modified,
384        });
385    }
386
387    summaries.sort_by(|a, b| a.repo_id.cmp(&b.repo_id));
388
389    Ok(summaries)
390}
391
392/// Returns the file count and total size for a single cached repo.
393///
394/// Avoids scanning the entire cache when only one repo's metrics are needed
395/// (e.g., for the `cache delete` preview).
396///
397/// # Errors
398///
399/// Returns [`FetchError::Io`] if the cache directory cannot be determined.
400pub fn repo_disk_usage(repo_id: &str) -> Result<(usize, u64), FetchError> {
401    let cache_dir = hf_cache_dir()?;
402    let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
403    let (file_count, total_size, _) = count_snapshot_files(&repo_dir);
404    Ok((file_count, total_size))
405}
406
407/// Checks whether a single cached repo has `.chunked.part` temp files.
408///
409/// Avoids scanning the entire cache when only one repo's partial status
410/// is needed (e.g., for the `du <REPO>` partial-download hint).
411///
412/// # Errors
413///
414/// Returns [`FetchError::Io`] if the cache directory cannot be determined.
415pub fn repo_has_partial(repo_id: &str) -> Result<bool, FetchError> {
416    let cache_dir = hf_cache_dir()?;
417    let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
418    let blobs_dir = crate::cache_layout::blobs_dir(&repo_dir);
419    Ok(find_partial_blob_size(&blobs_dir) > 0)
420}
421
422/// Counts files, total size, and most recent modification time across all
423/// snapshot directories for a repo.
424fn count_snapshot_files(repo_dir: &Path) -> (usize, u64, Option<std::time::SystemTime>) {
425    let snapshots_dir = crate::cache_layout::snapshots_dir(repo_dir);
426    let Ok(snapshots) = std::fs::read_dir(snapshots_dir) else {
427        return (0, 0, None);
428    };
429
430    let mut file_count: usize = 0;
431    let mut total_size: u64 = 0;
432    let mut latest: Option<std::time::SystemTime> = None;
433
434    for snap_entry in snapshots {
435        let Ok(snap_entry) = snap_entry else { continue };
436        let snap_path = snap_entry.path();
437        if !snap_path.is_dir() {
438            continue;
439        }
440        count_files_recursive(&snap_path, &mut file_count, &mut total_size, &mut latest);
441    }
442
443    (file_count, total_size, latest)
444}
445
/// Recursively counts files, accumulates sizes, and tracks the most recent
/// modification time in a directory.
///
/// Unreadable directories are silently skipped; files whose metadata cannot
/// be read still count toward `count` but contribute no bytes or timestamp.
fn count_files_recursive(
    dir: &Path,
    count: &mut usize,
    total: &mut u64,
    latest: &mut Option<std::time::SystemTime>,
) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };

    for entry in entries.flatten() {
        let path = entry.path();
        if path.is_dir() {
            count_files_recursive(&path, count, total, latest);
            continue;
        }

        *count += 1;
        // metadata() can fail (e.g. broken symlink); the file is still
        // counted above, just with zero bytes and no timestamp.
        if let Ok(meta) = entry.metadata() {
            *total += meta.len();
            if let Ok(modified) = meta.modified() {
                // Keep the newest mtime seen so far.
                if latest.map_or(true, |current| modified > current) {
                    *latest = Some(modified);
                }
            }
        }
    }
}
477
478/// Reads the commit hash from a refs file, if it exists.
479///
480/// Looks for `<repo_dir>/refs/<revision>` and returns the trimmed contents
481/// (a commit hash) or `None` if the file does not exist or is empty.
482#[must_use]
483pub fn read_ref(repo_dir: &Path, revision: &str) -> Option<String> {
484    let ref_path = crate::cache_layout::ref_path(repo_dir, revision);
485    std::fs::read_to_string(ref_path)
486        .ok()
487        // BORROW: explicit .to_owned() to convert trimmed &str → owned String
488        .map(|s| s.trim().to_owned())
489        .filter(|s| !s.is_empty())
490}
491
492/// Checks whether any `.chunked.part` temp file exists in the blobs directory.
493///
494/// This is a repo-level heuristic: it cannot map a specific filename to its
495/// blob without full LFS metadata, so it checks for any `.chunked.part` file.
496/// A `true` result means *some* file in the repo has a partial download.
497fn has_partial_blob(blobs_dir: &Path) -> bool {
498    find_partial_blob_size(blobs_dir) > 0
499}
500
/// Returns the size of the first `.chunked.part` file found in the blobs directory.
///
/// Returns `0` when the directory cannot be read, no such file exists, or
/// the matching file's metadata cannot be read.
fn find_partial_blob_size(blobs_dir: &Path) -> u64 {
    let Ok(entries) = std::fs::read_dir(blobs_dir) else {
        return 0;
    };

    entries
        .flatten()
        .find(|entry| entry.file_name().to_string_lossy().ends_with(".chunked.part"))
        .and_then(|entry| entry.metadata().ok())
        .map_or(0, |meta| meta.len())
}
518
/// A `.chunked.part` temp file left by an interrupted chunked download.
#[derive(Debug, Clone)]
pub struct PartialFile {
    /// The repository identifier (e.g., `"meta-llama/Llama-3.2-1B"`).
    pub repo_id: String,
    /// The `.chunked.part` filename (e.g., `"abc123def456.chunked.part"`).
    pub filename: String,
    /// Absolute path to the `.chunked.part` file.
    pub path: PathBuf,
    /// Size of the partial file in bytes.
    pub size: u64,
}

impl PartialFile {
    /// Returns sibling sidecar paths that should be removed alongside this
    /// partial: the resume-state sidecar `{etag}.chunked.part.state` and
    /// any orphan write-tmp `{etag}.chunked.part.state.tmp` left by an
    /// interrupted atomic save.
    ///
    /// The paths are returned even when the underlying files do not exist
    /// — callers (`run_cache_clean_partial`) attempt removal best-effort.
    #[must_use]
    pub fn sidecar_paths(&self) -> Vec<PathBuf> {
        // Suffixes are appended via plain string concatenation (mirroring
        // `cache_layout::temp_state_path`): the etag may itself contain
        // periods, so `Path::with_extension` would truncate at the wrong
        // boundary.
        match self.path.parent() {
            Some(parent) => [".state", ".state.tmp"]
                .iter()
                .map(|suffix| parent.join(format!("{}{suffix}", self.filename)))
                .collect(),
            None => Vec::new(),
        }
    }
}
557
558/// Finds all `.chunked.part` temp files in the `HuggingFace` cache.
559///
560/// Walks `models--*/blobs/` directories and collects partial files.
561/// When `repo_filter` is `Some`, only the matching repo is scanned.
562///
563/// Returns an empty `Vec` if the cache directory does not exist.
564///
565/// # Errors
566///
567/// Returns [`FetchError::Io`] if the cache directory cannot be read.
568pub fn find_partial_files(repo_filter: Option<&str>) -> Result<Vec<PartialFile>, FetchError> {
569    let cache_dir = hf_cache_dir()?;
570
571    if !cache_dir.exists() {
572        return Ok(Vec::new());
573    }
574
575    let entries = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
576        // BORROW: explicit .clone() for owned PathBuf
577        path: cache_dir.clone(),
578        source: e,
579    })?;
580
581    let mut partials: Vec<PartialFile> = Vec::new();
582
583    for entry in entries {
584        let Ok(entry) = entry else { continue };
585        let dir_name = entry.file_name();
586        // BORROW: explicit .to_string_lossy() for OsString → str conversion
587        let dir_str = dir_name.to_string_lossy();
588
589        let Some(repo_id) = repo_id_from_folder_name(&dir_str) else {
590            continue;
591        };
592
593        // Skip repos that don't match the filter.
594        // BORROW: explicit .as_str() instead of Deref coercion
595        if let Some(filter) = repo_filter {
596            if repo_id.as_str() != filter {
597                continue;
598            }
599        }
600
601        let blobs_dir = crate::cache_layout::blobs_dir(&entry.path());
602        let Ok(blob_entries) = std::fs::read_dir(&blobs_dir) else {
603            continue;
604        };
605
606        for blob_entry in blob_entries {
607            let Ok(blob_entry) = blob_entry else { continue };
608            let name = blob_entry.file_name();
609            // BORROW: explicit .to_string_lossy() for OsString → str conversion
610            let name_str = name.to_string_lossy();
611            if name_str.ends_with(".chunked.part") {
612                let size = blob_entry.metadata().map_or(0, |m| m.len());
613                partials.push(PartialFile {
614                    // BORROW: explicit .clone() for owned String
615                    repo_id: repo_id.clone(),
616                    // BORROW: explicit .to_string() for Cow<str> → owned String
617                    filename: name_str.to_string(),
618                    path: blob_entry.path(),
619                    size,
620                });
621            }
622        }
623    }
624
625    Ok(partials)
626}
627
/// Per-file disk usage entry within a cached repository.
///
/// Returned by [`cache_repo_usage`], sorted by size descending.
#[derive(Debug, Clone)]
pub struct CacheFileUsage {
    /// Filename relative to the snapshot directory.
    pub filename: String,
    /// File size in bytes.
    pub size: u64,
}
636
637/// Returns per-file disk usage for a specific cached repository.
638///
639/// Walks the snapshot directories under
640/// `<cache_dir>/models--<org>--<name>/snapshots/` and collects each file's
641/// relative path and size. Results are sorted by size descending.
642///
643/// Returns an empty `Vec` if the repository is not cached.
644///
645/// # Errors
646///
647/// Returns [`FetchError::Io`] if the cache directory cannot be determined.
648pub fn cache_repo_usage(repo_id: &str) -> Result<Vec<CacheFileUsage>, FetchError> {
649    let cache_dir = hf_cache_dir()?;
650    let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
651
652    if !repo_dir.exists() {
653        return Ok(Vec::new());
654    }
655
656    let snapshots_dir = crate::cache_layout::snapshots_dir(&repo_dir);
657    let Ok(snapshots) = std::fs::read_dir(&snapshots_dir) else {
658        return Ok(Vec::new());
659    };
660
661    let mut files: Vec<CacheFileUsage> = Vec::new();
662
663    for snap_entry in snapshots {
664        let Ok(snap_entry) = snap_entry else { continue };
665        let snap_path = snap_entry.path();
666        if !snap_path.is_dir() {
667            continue;
668        }
669        collect_snapshot_files(&snap_path, "", &mut files);
670    }
671
672    files.sort_by_key(|f| std::cmp::Reverse(f.size));
673
674    Ok(files)
675}
676
677/// Recursively collects files from a snapshot directory into `CacheFileUsage` entries.
678///
679/// The `prefix` parameter tracks the relative path from the snapshot root,
680/// so that files in subdirectories get paths like `"tokenizer/vocab.json"`.
681fn collect_snapshot_files(dir: &Path, prefix: &str, files: &mut Vec<CacheFileUsage>) {
682    let Ok(entries) = std::fs::read_dir(dir) else {
683        return;
684    };
685
686    for entry in entries {
687        let Ok(entry) = entry else { continue };
688        let path = entry.path();
689        // BORROW: explicit .to_string_lossy() for OsString → str conversion
690        let name = entry.file_name().to_string_lossy().to_string();
691
692        if path.is_dir() {
693            let child_prefix = if prefix.is_empty() {
694                name
695            } else {
696                format!("{prefix}/{name}")
697            };
698            collect_snapshot_files(&path, &child_prefix, files);
699        } else {
700            let filename = if prefix.is_empty() {
701                name
702            } else {
703                format!("{prefix}/{name}")
704            };
705            let size = entry.metadata().map_or(0, |m| m.len());
706            files.push(CacheFileUsage { filename, size });
707        }
708    }
709}
710
/// Verification status for a single cached file.
///
/// Produced by [`verify_cache`] / [`verify_cache_with_progress`]. Only
/// [`VerifyStatus::Mismatch`] indicates corruption; the other variants are
/// non-failures.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub enum VerifyStatus {
    /// Local `SHA256` matches the expected hash from `HuggingFace` LFS metadata.
    Ok,
    /// Local `SHA256` does not match the expected hash — the cached file is
    /// corrupted (bit rot, interrupted write, or upstream blob changed).
    Mismatch {
        /// Expected `SHA256` hex digest from `HuggingFace` LFS metadata.
        expected: String,
        /// Actual `SHA256` hex digest computed from the local file.
        actual: String,
    },
    /// File has no LFS metadata (small git-stored file); verification skipped.
    Skipped,
    /// File is absent from the local snapshot directory.
    Missing,
}
730
/// Result of verifying a single cached file against `HuggingFace` LFS metadata.
///
/// One entry per remote file, returned by [`verify_cache`] /
/// [`verify_cache_with_progress`] sorted by filename.
#[derive(Debug, Clone)]
pub struct FileVerification {
    /// Filename within the repository.
    pub filename: String,
    /// File size in bytes — local size when the file is present, otherwise
    /// the expected size from the API (or `0` when neither is known).
    pub size: u64,
    /// Verification result.
    pub status: VerifyStatus,
}
742
/// Streaming progress event emitted by [`verify_cache_with_progress`] so
/// callers can render per-file feedback during a long verification.
///
/// Events fire in this order:
/// 1. [`VerifyEvent::Started`] — once, after the metadata fetch completes,
///    before any per-file work begins. Carries the total file count and a
///    pre-computed maximum filename length so callers can size display
///    columns up-front.
/// 2. For each file in alphabetical order:
///    - [`VerifyEvent::FileStart`] — before the per-file `SHA256`
///      computation kicks in.
///    - [`VerifyEvent::FileComplete`] — when the per-file result is known,
///      carrying the [`VerifyStatus`] outcome.
///
/// The `'a` lifetime borrows the filename and status from the in-progress
/// verification, so events are cheap to construct (no allocation per event).
#[non_exhaustive]
#[derive(Debug)]
pub enum VerifyEvent<'a> {
    /// Fired once at the start of the run with summary stats useful for
    /// laying out a streamed table or progress display.
    Started {
        /// Total number of files that will be verified.
        total: usize,
        /// Maximum filename length across the verification list.
        max_filename_len: usize,
    },
    /// A file is about to be verified.
    FileStart {
        /// 1-based index of this file in the verification list.
        index: usize,
        /// Total number of files in the verification list.
        total: usize,
        /// Filename within the repository.
        filename: &'a str,
        /// File size in bytes (local size when present, else expected size).
        size: u64,
        /// `true` when the file has LFS metadata (a real `SHA256` computation
        /// is about to run); `false` when the file is git-stored and will be
        /// skipped near-instantly.
        has_lfs: bool,
    },
    /// A file's verification has completed.
    FileComplete {
        /// 1-based index of this file in the verification list.
        index: usize,
        /// Total number of files in the verification list.
        total: usize,
        /// Filename within the repository.
        filename: &'a str,
        /// File size in bytes (matches the `size` from the corresponding
        /// [`VerifyEvent::FileStart`]).
        size: u64,
        /// The per-file verification result.
        status: &'a VerifyStatus,
    },
}
797
798/// Verifies `SHA256` digests of cached files against `HuggingFace` LFS metadata.
799///
800/// Fetches the expected hashes from the `HuggingFace` API and, for each file
801/// that has an LFS `SHA256`, reads the local cached file and compares.
802///
803/// Files without LFS metadata (small git-stored files such as `config.json`)
804/// are reported as [`VerifyStatus::Skipped`]; files absent from the snapshot
805/// directory are reported as [`VerifyStatus::Missing`]. Both are
806/// non-failures — only [`VerifyStatus::Mismatch`] indicates a corrupted file.
807///
808/// `revision` defaults to `"main"` when `None`. Requires network access for
809/// the metadata fetch; the per-file digest computation is local-only.
810///
811/// For long verifications (multi-GiB safetensors files), prefer
812/// [`verify_cache_with_progress`] so a CLI / GUI can render a spinner or
813/// progress bar while each file is hashed.
814///
815/// # Errors
816///
817/// Returns [`FetchError::Http`] if the `HuggingFace` API request fails.
818/// Returns [`FetchError::Io`] when a local cached file is present but
819/// cannot be read.
820pub async fn verify_cache(
821    repo_id: &str,
822    token: Option<&str>,
823    revision: Option<&str>,
824) -> Result<Vec<FileVerification>, FetchError> {
825    verify_cache_with_progress(repo_id, token, revision, |_| {}).await
826}
827
828/// Same as [`verify_cache`] but emits [`VerifyEvent`]s through `on_event`
829/// so callers can render streaming progress (e.g. a spinner per file).
830///
831/// The callback runs on the same task as the verification — keep it short.
832/// Use interior mutability ([`std::cell::Cell`], [`std::cell::RefCell`]) if
833/// you need to track state across events; the closure may capture by shared
834/// reference because the API requires only [`Fn`].
835///
836/// Files are processed in alphabetical order by filename so that streamed
837/// output remains stable across runs and matches the sort order of the
838/// returned [`Vec<FileVerification>`].
839///
840/// # Errors
841///
842/// Same error conditions as [`verify_cache`].
843pub async fn verify_cache_with_progress<F>(
844    repo_id: &str,
845    token: Option<&str>,
846    revision: Option<&str>,
847    on_event: F,
848) -> Result<Vec<FileVerification>, FetchError>
849where
850    F: Fn(VerifyEvent<'_>),
851{
852    let revision = revision.unwrap_or("main");
853    let cache_dir = hf_cache_dir()?;
854    let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
855
856    let commit_hash = read_ref(&repo_dir, revision);
857
858    let client = crate::chunked::build_client(token)?;
859    let mut remote_files =
860        crate::repo::list_repo_files_with_metadata(repo_id, token, Some(revision), &client).await?;
861
862    // Sort up-front so streamed output is stable across runs and matches the
863    // returned `Vec<FileVerification>`'s order.
864    remote_files.sort_by(|a, b| a.filename.cmp(&b.filename));
865
866    // BORROW: explicit .as_deref() for Option<String> → Option<&str>
867    let snapshot_dir = commit_hash
868        .as_deref()
869        .map(|hash| crate::cache_layout::snapshot_dir(&repo_dir, hash));
870
871    let total = remote_files.len();
872    let max_filename_len = remote_files
873        .iter()
874        .map(|f| f.filename.len())
875        .max()
876        .unwrap_or(0);
877
878    on_event(VerifyEvent::Started {
879        total,
880        max_filename_len,
881    });
882
883    let mut results: Vec<FileVerification> = Vec::with_capacity(total);
884
885    for (i, remote) in remote_files.iter().enumerate() {
886        let index = i + 1;
887        let local_path = snapshot_dir
888            .as_ref()
889            // BORROW: explicit .as_str() for path construction
890            .map(|dir| dir.join(remote.filename.as_str()));
891
892        let exists = local_path.as_ref().is_some_and(|p| p.exists());
893        let local_size = local_path
894            .as_ref()
895            .filter(|_| exists)
896            .and_then(|p| std::fs::metadata(p).ok().map(|m| m.len()))
897            .unwrap_or(0);
898        let expected_size = remote.size.unwrap_or(0);
899        let display_size = if exists { local_size } else { expected_size };
900
901        let has_lfs = remote.sha256.is_some();
902        on_event(VerifyEvent::FileStart {
903            index,
904            total,
905            // BORROW: explicit .as_str() for &String → &str argument
906            filename: remote.filename.as_str(),
907            size: display_size,
908            has_lfs,
909        });
910
911        let status = match (remote.sha256.as_deref(), local_path.as_deref(), exists) {
912            (None, _, _) => VerifyStatus::Skipped,
913            (Some(_), None, _) | (Some(_), Some(_), false) => VerifyStatus::Missing,
914            (Some(expected), Some(path), true) => {
915                // BORROW: explicit .as_str() for &String → &str argument
916                match crate::checksum::verify_sha256(path, remote.filename.as_str(), expected).await
917                {
918                    Ok(()) => VerifyStatus::Ok,
919                    Err(FetchError::Checksum {
920                        expected, actual, ..
921                    }) => VerifyStatus::Mismatch { expected, actual },
922                    Err(e) => return Err(e),
923                }
924            }
925        };
926
927        on_event(VerifyEvent::FileComplete {
928            index,
929            total,
930            // BORROW: explicit .as_str() for &String → &str argument
931            filename: remote.filename.as_str(),
932            size: display_size,
933            status: &status,
934        });
935
936        results.push(FileVerification {
937            // BORROW: explicit .clone() for owned String
938            filename: remote.filename.clone(),
939            size: display_size,
940            status,
941        });
942    }
943
944    Ok(results)
945}
946
#[cfg(test)]
mod tests {
    #![allow(
        clippy::panic,
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::indexing_slicing
    )]

    use super::*;

    // Fake blobs directory shared by every fixture below.
    const BLOBS: &str = "/tmp/models--org--model/blobs";

    /// Builds a `PartialFile` for `filename` rooted at the fake blobs dir.
    fn fixture(filename: &str) -> PartialFile {
        PartialFile {
            repo_id: String::from("org/model"),
            filename: String::from(filename),
            path: Path::new(BLOBS).join(filename),
            size: 1024,
        }
    }

    #[test]
    fn sidecar_paths_returns_state_and_state_tmp() {
        let sidecars = fixture("abc123.chunked.part").sidecar_paths();

        let want_state = Path::new(BLOBS).join("abc123.chunked.part.state");
        let want_tmp = Path::new(BLOBS).join("abc123.chunked.part.state.tmp");

        assert_eq!(sidecars.len(), 2);
        assert_eq!(sidecars[0], want_state);
        assert_eq!(sidecars[1], want_tmp);
    }

    #[test]
    fn sidecar_paths_handles_etag_with_periods() {
        // Same period-handling rationale as `cache_layout::temp_state_path`:
        // the etag may itself contain dots, so naive `Path::with_extension`
        // would chop at the wrong boundary.
        let sidecars = fixture("abc.def.chunked.part").sidecar_paths();

        let want_state = Path::new(BLOBS).join("abc.def.chunked.part.state");
        let want_tmp = Path::new(BLOBS).join("abc.def.chunked.part.state.tmp");

        assert_eq!(sidecars[0], want_state);
        assert_eq!(sidecars[1], want_tmp);
    }
}