// hf_fetch_model/cache.rs
1// SPDX-License-Identifier: MIT OR Apache-2.0
2
3//! `HuggingFace` cache directory resolution, model family scanning, disk usage,
4//! and integrity verification.
5//!
6//! [`hf_cache_dir()`] locates the local HF cache. [`list_cached_families()`]
7//! scans downloaded models and groups them by `model_type`.
8//! [`cache_summary()`] provides per-repo size totals,
9//! [`cache_repo_usage()`] returns per-file disk usage for a single repo, and
10//! [`verify_cache()`] re-checks `SHA256` digests of cached files against
11//! `HuggingFace` LFS metadata.
12
13use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use crate::error::FetchError;
17
/// Reconstructs a repo ID from a `models--org--name` directory name.
///
/// Only the *first* `--` becomes a `/`: model names may themselves contain
/// `--`, so everything after the org/name boundary is kept verbatim.
///
/// Returns `None` if the directory name does not start with `models--`.
fn repo_id_from_folder_name(dir_name: &str) -> Option<String> {
    let repo_part = dir_name.strip_prefix("models--")?;

    // `split_once` splits on the first "--" only — exactly the org/name
    // boundary. A directory with no "--" is a bare repo name without an org.
    let repo_id = match repo_part.split_once("--") {
        Some((org, name)) => format!("{org}/{name}"),
        None => repo_part.to_string(),
    };

    Some(repo_id)
}
36
37/// Returns the `HuggingFace` Hub cache directory.
38///
39/// Resolution order:
40/// 1. `HF_HOME` environment variable + `/hub`
41/// 2. `~/.cache/huggingface/hub/` (via [`dirs::home_dir()`])
42///
43/// # Errors
44///
45/// Returns [`FetchError::Io`] if the home directory cannot be determined.
46pub fn hf_cache_dir() -> Result<PathBuf, FetchError> {
47 if let Ok(home) = std::env::var("HF_HOME") {
48 let mut path = PathBuf::from(home);
49 path.push("hub");
50 return Ok(path);
51 }
52
53 let home = dirs::home_dir().ok_or_else(|| FetchError::Io {
54 path: PathBuf::from("~"),
55 source: std::io::Error::new(std::io::ErrorKind::NotFound, "home directory not found"),
56 })?;
57
58 let mut path = home;
59 path.push(".cache");
60 path.push("huggingface");
61 path.push("hub");
62 Ok(path)
63}
64
65/// Scans the local HF cache for downloaded models and groups them by `model_type`.
66///
67/// Looks for `config.json` files inside model snapshot directories:
68/// `<cache>/models--<org>--<name>/snapshots/*/config.json`
69///
70/// Returns a map from `model_type` (e.g., `"llama"`) to a sorted list of
71/// repository identifiers (e.g., `["meta-llama/Llama-3.2-1B"]`).
72///
73/// Models without a `model_type` field in their `config.json` are skipped.
74///
75/// # Errors
76///
77/// Returns [`FetchError::Io`] if the cache directory cannot be read.
78pub fn list_cached_families() -> Result<BTreeMap<String, Vec<String>>, FetchError> {
79 let cache_dir = hf_cache_dir()?;
80
81 if !cache_dir.exists() {
82 return Ok(BTreeMap::new());
83 }
84
85 let entries = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
86 path: cache_dir.clone(),
87 source: e,
88 })?;
89
90 let mut families: BTreeMap<String, Vec<String>> = BTreeMap::new();
91
92 for entry in entries {
93 let Ok(entry) = entry else { continue };
94
95 let dir_name = entry.file_name();
96 // BORROW: explicit .to_string_lossy() for OsString → str conversion
97 let dir_str = dir_name.to_string_lossy();
98
99 let Some(repo_id) = repo_id_from_folder_name(&dir_str) else {
100 continue;
101 };
102
103 // Find the newest snapshot's config.json
104 let snapshots_dir = crate::cache_layout::snapshots_dir(&entry.path());
105 if !snapshots_dir.exists() {
106 continue;
107 }
108
109 if let Some(model_type) = find_model_type_in_snapshots(&snapshots_dir) {
110 families.entry(model_type).or_default().push(repo_id);
111 }
112 }
113
114 // Sort repo lists within each family for stable output
115 for repos in families.values_mut() {
116 repos.sort();
117 }
118
119 Ok(families)
120}
121
122/// Searches snapshot directories for a `config.json` containing `model_type`.
123///
124/// Returns the first `model_type` value found, or `None`.
125fn find_model_type_in_snapshots(snapshots_dir: &std::path::Path) -> Option<String> {
126 let snapshots = std::fs::read_dir(snapshots_dir).ok()?;
127
128 for snap_entry in snapshots {
129 let Ok(snap_entry) = snap_entry else { continue };
130 let config_path = snap_entry.path().join("config.json");
131
132 if !config_path.exists() {
133 continue;
134 }
135
136 if let Some(model_type) = extract_model_type(&config_path) {
137 return Some(model_type);
138 }
139 }
140
141 None
142}
143
144/// Reads a `config.json` file and extracts the `model_type` field.
145fn extract_model_type(config_path: &std::path::Path) -> Option<String> {
146 let contents = std::fs::read_to_string(config_path).ok()?;
147 // BORROW: explicit .as_str() instead of Deref coercion
148 let value: serde_json::Value = serde_json::from_str(contents.as_str()).ok()?;
149 // BORROW: explicit .as_str() on serde_json Value
150 value.get("model_type")?.as_str().map(String::from)
151}
152
/// Status of a single file in the cache.
///
/// Produced by [`repo_status`] when cross-referencing the remote file list
/// against the local snapshot directory.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum FileStatus {
    /// File is fully downloaded (local size matches expected size, or no expected size known).
    Complete {
        /// Local file size in bytes.
        local_size: u64,
    },
    /// File exists but is smaller than expected (interrupted download),
    /// or a `.chunked.part` temp file was found in the blobs directory
    /// (repo-level heuristic — may not correspond to this specific file).
    Partial {
        /// Local file size in bytes.
        local_size: u64,
        /// Expected file size in bytes.
        expected_size: u64,
    },
    /// File is not present in the cache.
    Missing {
        /// Expected file size in bytes (0 if unknown).
        expected_size: u64,
    },
}
177
/// Cache status report for a repository.
///
/// Built by [`repo_status`]; the counting methods summarize the per-file
/// [`FileStatus`] entries.
#[derive(Debug, Clone)]
pub struct RepoStatus {
    /// The repository identifier.
    pub repo_id: String,
    /// The resolved commit hash (if available).
    pub commit_hash: Option<String>,
    /// The cache directory for this repo.
    pub cache_path: PathBuf,
    /// Per-file status, sorted by filename.
    pub files: Vec<(String, FileStatus)>,
}
190
191impl RepoStatus {
192 /// Number of fully downloaded files.
193 #[must_use]
194 pub fn complete_count(&self) -> usize {
195 self.files
196 .iter()
197 .filter(|(_, s)| matches!(s, FileStatus::Complete { .. }))
198 .count()
199 }
200
201 /// Number of partially downloaded files.
202 #[must_use]
203 pub fn partial_count(&self) -> usize {
204 self.files
205 .iter()
206 .filter(|(_, s)| matches!(s, FileStatus::Partial { .. }))
207 .count()
208 }
209
210 /// Number of missing files.
211 #[must_use]
212 pub fn missing_count(&self) -> usize {
213 self.files
214 .iter()
215 .filter(|(_, s)| matches!(s, FileStatus::Missing { .. }))
216 .count()
217 }
218}
219
220/// Inspects the local cache for a repository and compares against the remote file list.
221///
222/// # Arguments
223///
224/// * `repo_id` — The repository identifier (e.g., `"RWKV/RWKV7-Goose-World3-1.5B-HF"`).
225/// * `token` — Optional authentication token.
226/// * `revision` — Optional revision (defaults to `"main"`).
227///
228/// # Notes
229///
230/// Partial download detection is a repo-level heuristic: if any
231/// `.chunked.part` file exists in the repo's `blobs/` directory, all
232/// missing files are reported as [`FileStatus::Partial`] with the partial
233/// file's size. This may overcount partials when multiple files are
234/// missing but only one has an incomplete blob. Exact blob-to-file
235/// mapping would require LFS metadata.
236///
237/// # Errors
238///
239/// Returns [`FetchError::Http`] if the API request fails.
240/// Returns [`FetchError::Io`] if the cache directory cannot be read.
241pub async fn repo_status(
242 repo_id: &str,
243 token: Option<&str>,
244 revision: Option<&str>,
245) -> Result<RepoStatus, FetchError> {
246 let revision = revision.unwrap_or("main");
247 let cache_dir = hf_cache_dir()?;
248 let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
249
250 // Read commit hash from refs file if available.
251 let commit_hash = read_ref(&repo_dir, revision);
252
253 // Fetch remote file list with sizes.
254 let client = crate::chunked::build_client(token)?;
255 let remote_files =
256 crate::repo::list_repo_files_with_metadata(repo_id, token, Some(revision), &client).await?;
257
258 // Determine snapshot directory.
259 // BORROW: explicit .as_deref() for Option<String> → Option<&str>
260 let snapshot_dir = commit_hash
261 .as_deref()
262 .map(|hash| crate::cache_layout::snapshot_dir(&repo_dir, hash));
263
264 // Pre-check for .chunked.part files in blobs directory (avoids re-scanning
265 // the blobs directory for every missing file in the loop below).
266 let blobs_dir = crate::cache_layout::blobs_dir(&repo_dir);
267 let has_any_partial = has_partial_blob(&blobs_dir);
268
269 // Cross-reference remote files against local state.
270 let mut files: Vec<(String, FileStatus)> = Vec::with_capacity(remote_files.len());
271
272 for remote in &remote_files {
273 let expected_size = remote.size.unwrap_or(0);
274
275 let local_path = snapshot_dir
276 .as_ref()
277 // BORROW: explicit .as_str() for path construction
278 .map(|dir| dir.join(remote.filename.as_str()));
279
280 let status = if let Some(ref path) = local_path {
281 if path.exists() {
282 let local_size = std::fs::metadata(path).map_or(0, |m| m.len());
283
284 if expected_size > 0 && local_size < expected_size {
285 FileStatus::Partial {
286 local_size,
287 expected_size,
288 }
289 } else {
290 FileStatus::Complete { local_size }
291 }
292 } else if has_any_partial {
293 // Blobs directory has .chunked.part temp files
294 let part_size = find_partial_blob_size(&blobs_dir);
295 FileStatus::Partial {
296 local_size: part_size,
297 expected_size,
298 }
299 } else {
300 FileStatus::Missing { expected_size }
301 }
302 } else {
303 FileStatus::Missing { expected_size }
304 };
305
306 // BORROW: explicit .clone() for owned String
307 files.push((remote.filename.clone(), status));
308 }
309
310 files.sort_by(|(a, _), (b, _)| a.cmp(b));
311
312 // BORROW: explicit .to_owned() for &str → owned String field
313 Ok(RepoStatus {
314 repo_id: repo_id.to_owned(),
315 commit_hash,
316 cache_path: repo_dir,
317 files,
318 })
319}
320
/// Summary of a single cached model (local-only, no API calls).
///
/// Produced by [`cache_summary`] from a `models--*` cache directory.
#[derive(Debug, Clone)]
pub struct CachedModelSummary {
    /// The repository identifier (e.g., `"RWKV/RWKV7-Goose-World3-1.5B-HF"`).
    pub repo_id: String,
    /// Number of files in the snapshot directory.
    pub file_count: usize,
    /// Total size on disk in bytes.
    pub total_size: u64,
    /// Whether there are incomplete `.chunked.part` temp files.
    pub has_partial: bool,
    /// Most recent modification time among files in the snapshot directory.
    ///
    /// `None` if no files were found or all metadata reads failed.
    pub last_modified: Option<std::time::SystemTime>,
}
337
338/// Scans the entire HF cache and returns a summary for each cached model.
339///
340/// This is a local-only operation (no API calls). It lists all `models--*`
341/// directories and counts files + sizes in each snapshot.
342///
343/// # Errors
344///
345/// Returns [`FetchError::Io`] if the cache directory cannot be read.
346pub fn cache_summary() -> Result<Vec<CachedModelSummary>, FetchError> {
347 let cache_dir = hf_cache_dir()?;
348
349 if !cache_dir.exists() {
350 return Ok(Vec::new());
351 }
352
353 let entries = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
354 path: cache_dir.clone(),
355 source: e,
356 })?;
357
358 let mut summaries: Vec<CachedModelSummary> = Vec::new();
359
360 for entry in entries {
361 let Ok(entry) = entry else { continue };
362 let dir_name = entry.file_name();
363 // BORROW: explicit .to_string_lossy() for OsString → str conversion
364 let dir_str = dir_name.to_string_lossy();
365
366 let Some(repo_id) = repo_id_from_folder_name(&dir_str) else {
367 continue;
368 };
369
370 let repo_dir = entry.path();
371
372 // Count files and total size in snapshots.
373 let (file_count, total_size, last_modified) = count_snapshot_files(&repo_dir);
374
375 // Check for partial downloads.
376 let has_partial = find_partial_blob_size(&crate::cache_layout::blobs_dir(&repo_dir)) > 0;
377
378 summaries.push(CachedModelSummary {
379 repo_id,
380 file_count,
381 total_size,
382 has_partial,
383 last_modified,
384 });
385 }
386
387 summaries.sort_by(|a, b| a.repo_id.cmp(&b.repo_id));
388
389 Ok(summaries)
390}
391
392/// Returns the file count and total size for a single cached repo.
393///
394/// Avoids scanning the entire cache when only one repo's metrics are needed
395/// (e.g., for the `cache delete` preview).
396///
397/// # Errors
398///
399/// Returns [`FetchError::Io`] if the cache directory cannot be determined.
400pub fn repo_disk_usage(repo_id: &str) -> Result<(usize, u64), FetchError> {
401 let cache_dir = hf_cache_dir()?;
402 let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
403 let (file_count, total_size, _) = count_snapshot_files(&repo_dir);
404 Ok((file_count, total_size))
405}
406
407/// Checks whether a single cached repo has `.chunked.part` temp files.
408///
409/// Avoids scanning the entire cache when only one repo's partial status
410/// is needed (e.g., for the `du <REPO>` partial-download hint).
411///
412/// # Errors
413///
414/// Returns [`FetchError::Io`] if the cache directory cannot be determined.
415pub fn repo_has_partial(repo_id: &str) -> Result<bool, FetchError> {
416 let cache_dir = hf_cache_dir()?;
417 let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
418 let blobs_dir = crate::cache_layout::blobs_dir(&repo_dir);
419 Ok(find_partial_blob_size(&blobs_dir) > 0)
420}
421
422/// Counts files, total size, and most recent modification time across all
423/// snapshot directories for a repo.
424fn count_snapshot_files(repo_dir: &Path) -> (usize, u64, Option<std::time::SystemTime>) {
425 let snapshots_dir = crate::cache_layout::snapshots_dir(repo_dir);
426 let Ok(snapshots) = std::fs::read_dir(snapshots_dir) else {
427 return (0, 0, None);
428 };
429
430 let mut file_count: usize = 0;
431 let mut total_size: u64 = 0;
432 let mut latest: Option<std::time::SystemTime> = None;
433
434 for snap_entry in snapshots {
435 let Ok(snap_entry) = snap_entry else { continue };
436 let snap_path = snap_entry.path();
437 if !snap_path.is_dir() {
438 continue;
439 }
440 count_files_recursive(&snap_path, &mut file_count, &mut total_size, &mut latest);
441 }
442
443 (file_count, total_size, latest)
444}
445
/// Recursively counts files, accumulates sizes, and tracks the most recent
/// modification time in a directory.
///
/// Files whose metadata cannot be read are still counted, but contribute
/// neither size nor modification time.
fn count_files_recursive(
    dir: &Path,
    count: &mut usize,
    total: &mut u64,
    latest: &mut Option<std::time::SystemTime>,
) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };

    for entry in entries.flatten() {
        let path = entry.path();
        if path.is_dir() {
            count_files_recursive(&path, count, total, latest);
            continue;
        }

        *count += 1;
        let Ok(meta) = entry.metadata() else {
            continue; // size/mtime unknown, but the file is still counted
        };
        *total += meta.len();
        if let Ok(modified) = meta.modified() {
            // Keep the newest mtime seen so far.
            if latest.map_or(true, |current| modified > current) {
                *latest = Some(modified);
            }
        }
    }
}
477
478/// Reads the commit hash from a refs file, if it exists.
479///
480/// Looks for `<repo_dir>/refs/<revision>` and returns the trimmed contents
481/// (a commit hash) or `None` if the file does not exist or is empty.
482#[must_use]
483pub fn read_ref(repo_dir: &Path, revision: &str) -> Option<String> {
484 let ref_path = crate::cache_layout::ref_path(repo_dir, revision);
485 std::fs::read_to_string(ref_path)
486 .ok()
487 // BORROW: explicit .to_owned() to convert trimmed &str → owned String
488 .map(|s| s.trim().to_owned())
489 .filter(|s| !s.is_empty())
490}
491
492/// Checks whether any `.chunked.part` temp file exists in the blobs directory.
493///
494/// This is a repo-level heuristic: it cannot map a specific filename to its
495/// blob without full LFS metadata, so it checks for any `.chunked.part` file.
496/// A `true` result means *some* file in the repo has a partial download.
497fn has_partial_blob(blobs_dir: &Path) -> bool {
498 find_partial_blob_size(blobs_dir) > 0
499}
500
/// Returns the size of the first `.chunked.part` file found in the blobs directory.
///
/// Returns `0` when the directory is unreadable, no temp file exists, or
/// the matching entry's metadata cannot be read.
fn find_partial_blob_size(blobs_dir: &Path) -> u64 {
    let Ok(entries) = std::fs::read_dir(blobs_dir) else {
        return 0;
    };

    entries
        .flatten()
        .find(|entry| entry.file_name().to_string_lossy().ends_with(".chunked.part"))
        .map_or(0, |entry| entry.metadata().map_or(0, |m| m.len()))
}
518
/// A `.chunked.part` temp file left by an interrupted chunked download.
///
/// Collected by [`find_partial_files`] from `models--*/blobs/` directories.
#[derive(Debug, Clone)]
pub struct PartialFile {
    /// The repository identifier (e.g., `"meta-llama/Llama-3.2-1B"`).
    pub repo_id: String,
    /// The `.chunked.part` filename (e.g., `"abc123def456.chunked.part"`).
    pub filename: String,
    /// Absolute path to the `.chunked.part` file.
    pub path: PathBuf,
    /// Size of the partial file in bytes.
    pub size: u64,
}
531
532impl PartialFile {
533 /// Returns sibling sidecar paths that should be removed alongside this
534 /// partial: the resume-state sidecar `{etag}.chunked.part.state` and
535 /// any orphan write-tmp `{etag}.chunked.part.state.tmp` left by an
536 /// interrupted atomic save.
537 ///
538 /// The paths are returned even when the underlying files do not exist
539 /// — callers (`run_cache_clean_partial`) attempt removal best-effort.
540 #[must_use]
541 pub fn sidecar_paths(&self) -> Vec<PathBuf> {
542 let Some(parent) = self.path.parent() else {
543 return Vec::new();
544 };
545 // String concat (mirrors `cache_layout::temp_state_path`'s
546 // rationale): the etag may itself contain periods, so
547 // `Path::with_extension` would truncate at the wrong boundary.
548 // BORROW: explicit .clone() for owned String → mutated copy
549 let mut state_name = self.filename.clone();
550 state_name.push_str(".state");
551 // BORROW: explicit .clone() for owned String → mutated copy
552 let mut tmp_name = self.filename.clone();
553 tmp_name.push_str(".state.tmp");
554 vec![parent.join(state_name), parent.join(tmp_name)]
555 }
556}
557
558/// Finds all `.chunked.part` temp files in the `HuggingFace` cache.
559///
560/// Walks `models--*/blobs/` directories and collects partial files.
561/// When `repo_filter` is `Some`, only the matching repo is scanned.
562///
563/// Returns an empty `Vec` if the cache directory does not exist.
564///
565/// # Errors
566///
567/// Returns [`FetchError::Io`] if the cache directory cannot be read.
568pub fn find_partial_files(repo_filter: Option<&str>) -> Result<Vec<PartialFile>, FetchError> {
569 let cache_dir = hf_cache_dir()?;
570
571 if !cache_dir.exists() {
572 return Ok(Vec::new());
573 }
574
575 let entries = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
576 // BORROW: explicit .clone() for owned PathBuf
577 path: cache_dir.clone(),
578 source: e,
579 })?;
580
581 let mut partials: Vec<PartialFile> = Vec::new();
582
583 for entry in entries {
584 let Ok(entry) = entry else { continue };
585 let dir_name = entry.file_name();
586 // BORROW: explicit .to_string_lossy() for OsString → str conversion
587 let dir_str = dir_name.to_string_lossy();
588
589 let Some(repo_id) = repo_id_from_folder_name(&dir_str) else {
590 continue;
591 };
592
593 // Skip repos that don't match the filter.
594 // BORROW: explicit .as_str() instead of Deref coercion
595 if let Some(filter) = repo_filter {
596 if repo_id.as_str() != filter {
597 continue;
598 }
599 }
600
601 let blobs_dir = crate::cache_layout::blobs_dir(&entry.path());
602 let Ok(blob_entries) = std::fs::read_dir(&blobs_dir) else {
603 continue;
604 };
605
606 for blob_entry in blob_entries {
607 let Ok(blob_entry) = blob_entry else { continue };
608 let name = blob_entry.file_name();
609 // BORROW: explicit .to_string_lossy() for OsString → str conversion
610 let name_str = name.to_string_lossy();
611 if name_str.ends_with(".chunked.part") {
612 let size = blob_entry.metadata().map_or(0, |m| m.len());
613 partials.push(PartialFile {
614 // BORROW: explicit .clone() for owned String
615 repo_id: repo_id.clone(),
616 // BORROW: explicit .to_string() for Cow<str> → owned String
617 filename: name_str.to_string(),
618 path: blob_entry.path(),
619 size,
620 });
621 }
622 }
623 }
624
625 Ok(partials)
626}
627
/// Per-file disk usage entry within a cached repository.
///
/// Produced by [`cache_repo_usage`].
#[derive(Debug, Clone)]
pub struct CacheFileUsage {
    /// Filename relative to the snapshot directory.
    pub filename: String,
    /// File size in bytes.
    pub size: u64,
}
636
637/// Returns per-file disk usage for a specific cached repository.
638///
639/// Walks the snapshot directories under
640/// `<cache_dir>/models--<org>--<name>/snapshots/` and collects each file's
641/// relative path and size. Results are sorted by size descending.
642///
643/// Returns an empty `Vec` if the repository is not cached.
644///
645/// # Errors
646///
647/// Returns [`FetchError::Io`] if the cache directory cannot be determined.
648pub fn cache_repo_usage(repo_id: &str) -> Result<Vec<CacheFileUsage>, FetchError> {
649 let cache_dir = hf_cache_dir()?;
650 let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);
651
652 if !repo_dir.exists() {
653 return Ok(Vec::new());
654 }
655
656 let snapshots_dir = crate::cache_layout::snapshots_dir(&repo_dir);
657 let Ok(snapshots) = std::fs::read_dir(&snapshots_dir) else {
658 return Ok(Vec::new());
659 };
660
661 let mut files: Vec<CacheFileUsage> = Vec::new();
662
663 for snap_entry in snapshots {
664 let Ok(snap_entry) = snap_entry else { continue };
665 let snap_path = snap_entry.path();
666 if !snap_path.is_dir() {
667 continue;
668 }
669 collect_snapshot_files(&snap_path, "", &mut files);
670 }
671
672 files.sort_by_key(|f| std::cmp::Reverse(f.size));
673
674 Ok(files)
675}
676
677/// Recursively collects files from a snapshot directory into `CacheFileUsage` entries.
678///
679/// The `prefix` parameter tracks the relative path from the snapshot root,
680/// so that files in subdirectories get paths like `"tokenizer/vocab.json"`.
681fn collect_snapshot_files(dir: &Path, prefix: &str, files: &mut Vec<CacheFileUsage>) {
682 let Ok(entries) = std::fs::read_dir(dir) else {
683 return;
684 };
685
686 for entry in entries {
687 let Ok(entry) = entry else { continue };
688 let path = entry.path();
689 // BORROW: explicit .to_string_lossy() for OsString → str conversion
690 let name = entry.file_name().to_string_lossy().to_string();
691
692 if path.is_dir() {
693 let child_prefix = if prefix.is_empty() {
694 name
695 } else {
696 format!("{prefix}/{name}")
697 };
698 collect_snapshot_files(&path, &child_prefix, files);
699 } else {
700 let filename = if prefix.is_empty() {
701 name
702 } else {
703 format!("{prefix}/{name}")
704 };
705 let size = entry.metadata().map_or(0, |m| m.len());
706 files.push(CacheFileUsage { filename, size });
707 }
708 }
709}
710
/// Verification status for a single cached file.
///
/// Carried inside [`FileVerification`]; only [`VerifyStatus::Mismatch`]
/// indicates actual corruption.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub enum VerifyStatus {
    /// Local `SHA256` matches the expected hash from `HuggingFace` LFS metadata.
    Ok,
    /// Local `SHA256` does not match the expected hash — the cached file is
    /// corrupted (bit rot, interrupted write, or upstream blob changed).
    Mismatch {
        /// Expected `SHA256` hex digest from `HuggingFace` LFS metadata.
        expected: String,
        /// Actual `SHA256` hex digest computed from the local file.
        actual: String,
    },
    /// File has no LFS metadata (small git-stored file); verification skipped.
    Skipped,
    /// File is absent from the local snapshot directory.
    Missing,
}
730
/// Result of verifying a single cached file against `HuggingFace` LFS metadata.
///
/// Returned (one entry per remote file) by [`verify_cache`] and
/// [`verify_cache_with_progress`], sorted by filename.
#[derive(Debug, Clone)]
pub struct FileVerification {
    /// Filename within the repository.
    pub filename: String,
    /// File size in bytes — local size when the file is present, otherwise
    /// the expected size from the API (or `0` when neither is known).
    pub size: u64,
    /// Verification result.
    pub status: VerifyStatus,
}
742
/// Streaming progress event emitted by [`verify_cache_with_progress`] so
/// callers can render per-file feedback during a long verification.
///
/// Events fire in this order:
/// 1. [`VerifyEvent::Started`] — once, after the metadata fetch completes,
///    before any per-file work begins. Carries the total file count and a
///    pre-computed maximum filename length so callers can size display
///    columns up-front.
/// 2. For each file in alphabetical order:
///    - [`VerifyEvent::FileStart`] — before the per-file `SHA256`
///      computation kicks in.
///    - [`VerifyEvent::FileComplete`] — when the per-file result is known,
///      carrying the [`VerifyStatus`] outcome.
#[non_exhaustive]
#[derive(Debug)]
pub enum VerifyEvent<'a> {
    /// Fired once at the start of the run with summary stats useful for
    /// laying out a streamed table or progress display.
    Started {
        /// Total number of files that will be verified.
        total: usize,
        /// Maximum filename length across the verification list.
        max_filename_len: usize,
    },
    /// A file is about to be verified.
    FileStart {
        /// 1-based index of this file in the verification list.
        index: usize,
        /// Total number of files in the verification list (constant across
        /// the run; repeated here for stateless renderers).
        total: usize,
        /// Filename within the repository.
        filename: &'a str,
        /// File size in bytes (local size when present, else expected size).
        size: u64,
        /// `true` when the file has LFS metadata (a real `SHA256` computation
        /// is about to run); `false` when the file is git-stored and will be
        /// skipped near-instantly.
        has_lfs: bool,
    },
    /// A file's verification has completed.
    FileComplete {
        /// 1-based index of this file in the verification list.
        index: usize,
        /// Total number of files in the verification list.
        total: usize,
        /// Filename within the repository.
        filename: &'a str,
        /// File size in bytes (matches the `size` from the corresponding
        /// [`VerifyEvent::FileStart`]).
        size: u64,
        /// The per-file verification result.
        status: &'a VerifyStatus,
    },
}
797
/// Verifies `SHA256` digests of cached files against `HuggingFace` LFS metadata.
///
/// Fetches the expected hashes from the `HuggingFace` API and, for each file
/// that has an LFS `SHA256`, reads the local cached file and compares.
///
/// Files without LFS metadata (small git-stored files such as `config.json`)
/// are reported as [`VerifyStatus::Skipped`]; files absent from the snapshot
/// directory are reported as [`VerifyStatus::Missing`]. Both are
/// non-failures — only [`VerifyStatus::Mismatch`] indicates a corrupted file.
///
/// `revision` defaults to `"main"` when `None`. Requires network access for
/// the metadata fetch; the per-file digest computation is local-only.
///
/// For long verifications (multi-GiB safetensors files), prefer
/// [`verify_cache_with_progress`] so a CLI / GUI can render a spinner or
/// progress bar while each file is hashed.
///
/// # Errors
///
/// Returns [`FetchError::Http`] if the `HuggingFace` API request fails.
/// Returns [`FetchError::Io`] when a local cached file is present but
/// cannot be read.
pub async fn verify_cache(
    repo_id: &str,
    token: Option<&str>,
    revision: Option<&str>,
) -> Result<Vec<FileVerification>, FetchError> {
    // Thin wrapper: same verification, with a no-op progress callback.
    verify_cache_with_progress(repo_id, token, revision, |_| {}).await
}
827
/// Same as [`verify_cache`] but emits [`VerifyEvent`]s through `on_event`
/// so callers can render streaming progress (e.g. a spinner per file).
///
/// The callback runs on the same task as the verification — keep it short.
/// Use interior mutability ([`std::cell::Cell`], [`std::cell::RefCell`]) if
/// you need to track state across events; the closure may capture by shared
/// reference because the API requires only [`Fn`].
///
/// Files are processed in alphabetical order by filename so that streamed
/// output remains stable across runs and matches the sort order of the
/// returned [`Vec<FileVerification>`].
///
/// # Errors
///
/// Same error conditions as [`verify_cache`].
pub async fn verify_cache_with_progress<F>(
    repo_id: &str,
    token: Option<&str>,
    revision: Option<&str>,
    on_event: F,
) -> Result<Vec<FileVerification>, FetchError>
where
    F: Fn(VerifyEvent<'_>),
{
    let revision = revision.unwrap_or("main");
    let cache_dir = hf_cache_dir()?;
    let repo_dir = crate::cache_layout::repo_dir(&cache_dir, repo_id);

    // Resolve the local snapshot commit for `revision` from the refs file.
    let commit_hash = read_ref(&repo_dir, revision);

    // Network fetch: filenames, sizes, and LFS SHA256 digests.
    let client = crate::chunked::build_client(token)?;
    let mut remote_files =
        crate::repo::list_repo_files_with_metadata(repo_id, token, Some(revision), &client).await?;

    // Sort up-front so streamed output is stable across runs and matches the
    // returned `Vec<FileVerification>`'s order.
    remote_files.sort_by(|a, b| a.filename.cmp(&b.filename));

    // `None` when the revision was never resolved locally — every file is
    // then reported as `Missing` below.
    let snapshot_dir = commit_hash
        .as_deref()
        .map(|hash| crate::cache_layout::snapshot_dir(&repo_dir, hash));

    let total = remote_files.len();
    let max_filename_len = remote_files
        .iter()
        .map(|f| f.filename.len())
        .max()
        .unwrap_or(0);

    // Started fires exactly once, before any per-file events.
    on_event(VerifyEvent::Started {
        total,
        max_filename_len,
    });

    let mut results: Vec<FileVerification> = Vec::with_capacity(total);

    for (i, remote) in remote_files.iter().enumerate() {
        let index = i + 1; // events use 1-based indices for display
        let local_path = snapshot_dir
            .as_ref()
            .map(|dir| dir.join(remote.filename.as_str()));

        let exists = local_path.as_ref().is_some_and(|p| p.exists());
        let local_size = local_path
            .as_ref()
            .filter(|_| exists)
            .and_then(|p| std::fs::metadata(p).ok().map(|m| m.len()))
            .unwrap_or(0);
        let expected_size = remote.size.unwrap_or(0);
        // Show the local size when the file is present, else the remote size.
        let display_size = if exists { local_size } else { expected_size };

        let has_lfs = remote.sha256.is_some();
        on_event(VerifyEvent::FileStart {
            index,
            total,
            filename: remote.filename.as_str(),
            size: display_size,
            has_lfs,
        });

        // No LFS digest → Skipped; digest but no local file → Missing;
        // digest and local file → hash and compare.
        let status = match (remote.sha256.as_deref(), local_path.as_deref(), exists) {
            (None, _, _) => VerifyStatus::Skipped,
            (Some(_), None, _) | (Some(_), Some(_), false) => VerifyStatus::Missing,
            (Some(expected), Some(path), true) => {
                match crate::checksum::verify_sha256(path, remote.filename.as_str(), expected).await
                {
                    Ok(()) => VerifyStatus::Ok,
                    // A digest mismatch is a per-file result, not a fatal error.
                    Err(FetchError::Checksum {
                        expected, actual, ..
                    }) => VerifyStatus::Mismatch { expected, actual },
                    // I/O and other failures abort the whole verification.
                    Err(e) => return Err(e),
                }
            }
        };

        on_event(VerifyEvent::FileComplete {
            index,
            total,
            filename: remote.filename.as_str(),
            size: display_size,
            status: &status,
        });

        results.push(FileVerification {
            filename: remote.filename.clone(),
            size: display_size,
            status,
        });
    }

    Ok(results)
}
946
#[cfg(test)]
mod tests {
    #![allow(
        clippy::panic,
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::indexing_slicing
    )]

    use super::*;

    /// Builds a `PartialFile` rooted in a fixed fake blobs directory.
    fn sample_partial(filename: &str) -> PartialFile {
        let blobs = PathBuf::from("/tmp/models--org--model/blobs");
        PartialFile {
            repo_id: String::from("org/model"),
            filename: String::from(filename),
            path: blobs.join(filename),
            size: 1024,
        }
    }

    #[test]
    fn sidecar_paths_returns_state_and_state_tmp() {
        let sidecars = sample_partial("abc123.chunked.part").sidecar_paths();

        let expected = [
            "/tmp/models--org--model/blobs/abc123.chunked.part.state",
            "/tmp/models--org--model/blobs/abc123.chunked.part.state.tmp",
        ];
        assert_eq!(sidecars.len(), expected.len());
        for (actual, want) in sidecars.iter().zip(expected) {
            assert_eq!(actual, &PathBuf::from(want));
        }
    }

    #[test]
    fn sidecar_paths_handles_etag_with_periods() {
        // Same period-handling rationale as `cache_layout::temp_state_path`:
        // the etag may itself contain dots, so naive `Path::with_extension`
        // would chop at the wrong boundary.
        let sidecars = sample_partial("abc.def.chunked.part").sidecar_paths();

        assert_eq!(
            sidecars,
            vec![
                PathBuf::from("/tmp/models--org--model/blobs/abc.def.chunked.part.state"),
                PathBuf::from("/tmp/models--org--model/blobs/abc.def.chunked.part.state.tmp"),
            ]
        );
    }
}
1000}