use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use crate::error::FetchError;
/// Converts a hub-cache folder name like `models--org--name` back into the
/// `org/name` repo id.
///
/// Returns `None` when `dir_name` lacks the `models--` prefix. A remainder
/// with no further `--` separator is returned as-is (repos without an org).
fn repo_id_from_folder_name(dir_name: &str) -> Option<String> {
    let rest = dir_name.strip_prefix("models--")?;
    let repo_id = match rest.split_once("--") {
        Some((org, name)) => format!("{org}/{name}"),
        None => rest.to_string(),
    };
    Some(repo_id)
}
/// Resolves the Hugging Face hub cache directory.
///
/// Resolution order follows the `huggingface_hub` convention:
/// 1. `HF_HUB_CACHE` — used verbatim as the hub cache path.
/// 2. `HF_HOME` — the hub cache is `$HF_HOME/hub`.
/// 3. `~/.cache/huggingface/hub` as the default.
///
/// # Errors
///
/// Returns [`FetchError::Io`] when no env override is set and the user's
/// home directory cannot be determined.
pub fn hf_cache_dir() -> Result<PathBuf, FetchError> {
    // HF_HUB_CACHE points directly at the hub cache (no `hub` suffix added),
    // matching the official huggingface_hub client behavior.
    if let Ok(hub_cache) = std::env::var("HF_HUB_CACHE") {
        return Ok(PathBuf::from(hub_cache));
    }
    if let Ok(home) = std::env::var("HF_HOME") {
        let mut path = PathBuf::from(home);
        path.push("hub");
        return Ok(path);
    }
    let home = dirs::home_dir().ok_or_else(|| FetchError::Io {
        path: PathBuf::from("~"),
        source: std::io::Error::new(std::io::ErrorKind::NotFound, "home directory not found"),
    })?;
    let mut path = home;
    path.push(".cache");
    path.push("huggingface");
    path.push("hub");
    Ok(path)
}
/// Groups cached repos by the `model_type` declared in their `config.json`.
///
/// Scans every `models--*` folder under the hub cache, looks for the first
/// snapshot whose `config.json` declares a `model_type`, and buckets the
/// repo id under that type. Repos without a readable model type are skipped.
/// Each bucket's repo list is sorted.
///
/// # Errors
///
/// Returns [`FetchError::Io`] when the cache directory exists but cannot be
/// read. A missing cache directory yields an empty map.
pub fn list_cached_families() -> Result<BTreeMap<String, Vec<String>>, FetchError> {
    let cache_dir = hf_cache_dir()?;
    if !cache_dir.exists() {
        return Ok(BTreeMap::new());
    }
    let dir_iter = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
        path: cache_dir.clone(),
        source: e,
    })?;
    let mut families: BTreeMap<String, Vec<String>> = BTreeMap::new();
    for entry in dir_iter.flatten() {
        let folder = entry.file_name();
        let repo_id = match repo_id_from_folder_name(&folder.to_string_lossy()) {
            Some(id) => id,
            None => continue,
        };
        let snapshots_dir = entry.path().join("snapshots");
        if snapshots_dir.exists() {
            if let Some(model_type) = find_model_type_in_snapshots(&snapshots_dir) {
                families.entry(model_type).or_default().push(repo_id);
            }
        }
    }
    families.values_mut().for_each(|repos| repos.sort());
    Ok(families)
}
/// Returns the `model_type` from the first snapshot directory containing a
/// readable `config.json`, or `None` when no snapshot yields one.
fn find_model_type_in_snapshots(snapshots_dir: &std::path::Path) -> Option<String> {
    std::fs::read_dir(snapshots_dir)
        .ok()?
        .flatten()
        .map(|snap| snap.path().join("config.json"))
        .filter(|config_path| config_path.exists())
        .find_map(|config_path| extract_model_type(&config_path))
}
/// Parses `config_path` as JSON and returns its top-level `"model_type"`
/// string, or `None` on any read or parse failure.
fn extract_model_type(config_path: &std::path::Path) -> Option<String> {
    let raw = std::fs::read_to_string(config_path).ok()?;
    let parsed: serde_json::Value = serde_json::from_str(&raw).ok()?;
    parsed
        .get("model_type")
        .and_then(serde_json::Value::as_str)
        .map(str::to_owned)
}
/// Local cache status of a single remote file.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum FileStatus {
    /// The file is present locally and not smaller than the expected size.
    Complete {
        // Size of the local file in bytes.
        local_size: u64,
    },
    /// The file is only partially downloaded.
    Partial {
        // Bytes present locally so far.
        local_size: u64,
        // Size reported by the remote, in bytes (0 when unknown).
        expected_size: u64,
    },
    /// The file is absent from the local cache.
    Missing {
        // Size reported by the remote, in bytes (0 when unknown).
        expected_size: u64,
    },
}
/// Snapshot of a repo's local cache state versus its remote file listing.
#[derive(Debug, Clone)]
pub struct RepoStatus {
    // Repo id in `org/name` form.
    pub repo_id: String,
    // Commit hash pinned by the cached ref, if this revision was cached.
    pub commit_hash: Option<String>,
    // Path to the repo's `models--…` folder in the hub cache.
    pub cache_path: PathBuf,
    // `(filename, status)` pairs, sorted by filename.
    pub files: Vec<(String, FileStatus)>,
}
impl RepoStatus {
#[must_use]
pub fn complete_count(&self) -> usize {
self.files
.iter()
.filter(|(_, s)| matches!(s, FileStatus::Complete { .. }))
.count()
}
#[must_use]
pub fn partial_count(&self) -> usize {
self.files
.iter()
.filter(|(_, s)| matches!(s, FileStatus::Partial { .. }))
.count()
}
#[must_use]
pub fn missing_count(&self) -> usize {
self.files
.iter()
.filter(|(_, s)| matches!(s, FileStatus::Missing { .. }))
.count()
}
}
/// Compares the remote file listing of `repo_id` at `revision` (default
/// `"main"`) against the local hub cache and reports per-file status.
///
/// `token` is forwarded to the remote listing call (presumably for private
/// repos — TODO confirm against `crate::repo`). The local snapshot is
/// located via the cached ref for `revision`; when no ref is cached, every
/// remote file is reported as `Missing`.
///
/// # Errors
///
/// Returns [`FetchError`] when the cache directory cannot be resolved or
/// the remote file listing cannot be fetched.
pub async fn repo_status(
    repo_id: &str,
    token: Option<&str>,
    revision: Option<&str>,
) -> Result<RepoStatus, FetchError> {
    let revision = revision.unwrap_or("main");
    let cache_dir = hf_cache_dir()?;
    let repo_folder = crate::chunked::repo_folder_name(repo_id);
    let repo_dir = cache_dir.join(repo_folder.as_str());
    // Commit hash pinned by the local ref, if this revision was cached before.
    let commit_hash = read_ref(&repo_dir, revision);
    let remote_files =
        crate::repo::list_repo_files_with_metadata(repo_id, token, Some(revision)).await?;
    let snapshot_dir = commit_hash
        .as_deref()
        .map(|hash| repo_dir.join("snapshots").join(hash));
    let blobs_dir = repo_dir.join("blobs");
    // NOTE(review): a single `*.chunked.part` blob causes EVERY absent file
    // below to be reported as Partial with the same byte count — this looks
    // like it assumes at most one in-flight chunked download per repo;
    // confirm against the chunked downloader.
    let has_any_partial = has_partial_blob(&blobs_dir);
    let mut files: Vec<(String, FileStatus)> = Vec::with_capacity(remote_files.len());
    for remote in &remote_files {
        // Remote size may be unknown; 0 stands for "no expected size".
        let expected_size = remote.size.unwrap_or(0);
        let local_path = snapshot_dir
            .as_ref()
            .map(|dir| dir.join(remote.filename.as_str()));
        let status = if let Some(ref path) = local_path {
            if path.exists() {
                let local_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
                // With an unknown expected size (0), an existing file is
                // assumed complete.
                if expected_size > 0 && local_size < expected_size {
                    FileStatus::Partial {
                        local_size,
                        expected_size,
                    }
                } else {
                    FileStatus::Complete { local_size }
                }
            } else if has_any_partial {
                let part_size = find_partial_blob_size(&blobs_dir);
                FileStatus::Partial {
                    local_size: part_size,
                    expected_size,
                }
            } else {
                FileStatus::Missing { expected_size }
            }
        } else {
            // No cached snapshot for this revision at all.
            FileStatus::Missing { expected_size }
        };
        files.push((remote.filename.clone(), status));
    }
    files.sort_by(|(a, _), (b, _)| a.cmp(b));
    Ok(RepoStatus {
        repo_id: repo_id.to_owned(),
        commit_hash,
        cache_path: repo_dir,
        files,
    })
}
/// Aggregate disk-usage figures for one cached model repo.
#[derive(Debug, Clone)]
pub struct CachedModelSummary {
    // Repo id in `org/name` form.
    pub repo_id: String,
    // Number of files across all snapshot directories.
    pub file_count: usize,
    // Total bytes across all snapshot files.
    pub total_size: u64,
    // Whether a partial `*.chunked.part` blob exists for this repo.
    pub has_partial: bool,
    // Most recent modification time seen among snapshot files.
    pub last_modified: Option<std::time::SystemTime>,
}
/// Builds a per-repo summary (file count, total size, partial flag, latest
/// mtime) for every model in the hub cache, sorted by repo id.
///
/// # Errors
///
/// Returns [`FetchError::Io`] when the cache directory exists but cannot be
/// read. A missing cache directory yields an empty list.
pub fn cache_summary() -> Result<Vec<CachedModelSummary>, FetchError> {
    let cache_dir = hf_cache_dir()?;
    if !cache_dir.exists() {
        return Ok(Vec::new());
    }
    let dir_iter = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
        path: cache_dir.clone(),
        source: e,
    })?;
    let mut summaries = Vec::new();
    for entry in dir_iter.flatten() {
        let folder = entry.file_name();
        let Some(repo_id) = repo_id_from_folder_name(&folder.to_string_lossy()) else {
            continue;
        };
        let repo_dir = entry.path();
        let (file_count, total_size, last_modified) = count_snapshot_files(&repo_dir);
        summaries.push(CachedModelSummary {
            repo_id,
            file_count,
            total_size,
            has_partial: find_partial_blob_size(&repo_dir.join("blobs")) > 0,
            last_modified,
        });
    }
    summaries.sort_by(|a, b| a.repo_id.cmp(&b.repo_id));
    Ok(summaries)
}
pub fn repo_disk_usage(repo_id: &str) -> Result<(usize, u64), FetchError> {
let cache_dir = hf_cache_dir()?;
let repo_folder = format!("models--{}", repo_id.replace('/', "--"));
let repo_dir = cache_dir.join(repo_folder.as_str());
let (file_count, total_size, _) = count_snapshot_files(&repo_dir);
Ok((file_count, total_size))
}
pub fn repo_has_partial(repo_id: &str) -> Result<bool, FetchError> {
let cache_dir = hf_cache_dir()?;
let repo_folder = format!("models--{}", repo_id.replace('/', "--"));
let blobs_dir = cache_dir.join(repo_folder.as_str()).join("blobs");
Ok(find_partial_blob_size(&blobs_dir) > 0)
}
/// Totals file count, byte size, and latest modification time across every
/// snapshot directory of a cached repo. An unreadable `snapshots` directory
/// yields `(0, 0, None)`.
fn count_snapshot_files(repo_dir: &Path) -> (usize, u64, Option<std::time::SystemTime>) {
    let mut file_count = 0usize;
    let mut total_size = 0u64;
    let mut latest = None;
    if let Ok(snapshots) = std::fs::read_dir(repo_dir.join("snapshots")) {
        for snap in snapshots.flatten() {
            let snap_path = snap.path();
            // Non-directory entries under `snapshots` are ignored.
            if snap_path.is_dir() {
                count_files_recursive(&snap_path, &mut file_count, &mut total_size, &mut latest);
            }
        }
    }
    (file_count, total_size, latest)
}
/// Recursively accumulates the file count, total byte size, and newest
/// modification time under `dir`. Unreadable directories are skipped; a
/// file whose metadata cannot be read still counts, contributing 0 bytes.
fn count_files_recursive(
    dir: &Path,
    count: &mut usize,
    total: &mut u64,
    latest: &mut Option<std::time::SystemTime>,
) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };
    for entry in entries.flatten() {
        let path = entry.path();
        if path.is_dir() {
            count_files_recursive(&path, count, total, latest);
            continue;
        }
        *count += 1;
        if let Ok(meta) = entry.metadata() {
            *total += meta.len();
            if let Ok(modified) = meta.modified() {
                if (*latest).map_or(true, |current| modified > current) {
                    *latest = Some(modified);
                }
            }
        }
    }
}
/// Reads the commit hash recorded for `revision` under `repo_dir/refs`.
///
/// Returns `None` when the ref file is missing, unreadable, or blank.
#[must_use]
pub fn read_ref(repo_dir: &Path, revision: &str) -> Option<String> {
    let contents = std::fs::read_to_string(repo_dir.join("refs").join(revision)).ok()?;
    let hash = contents.trim();
    if hash.is_empty() {
        None
    } else {
        Some(hash.to_owned())
    }
}
/// True when `blobs_dir` holds at least one in-progress `*.chunked.part`
/// download.
fn has_partial_blob(blobs_dir: &Path) -> bool {
    find_partial_blob_size(blobs_dir) != 0
}
/// Size in bytes of the first `*.chunked.part` blob found in `blobs_dir`,
/// or 0 when the directory is unreadable or holds no partial blob. Only the
/// first matching entry is inspected; any further partial blobs are ignored.
fn find_partial_blob_size(blobs_dir: &Path) -> u64 {
    std::fs::read_dir(blobs_dir)
        .into_iter()
        .flatten()
        .flatten()
        .find(|entry| entry.file_name().to_string_lossy().ends_with(".chunked.part"))
        .and_then(|entry| entry.metadata().ok())
        .map_or(0, |meta| meta.len())
}
/// A partially-downloaded chunked blob found in the cache.
#[derive(Debug, Clone)]
pub struct PartialFile {
    // Repo id in `org/name` form.
    pub repo_id: String,
    // Blob file name (ends with `.chunked.part`).
    pub filename: String,
    // Full path to the partial blob.
    pub path: PathBuf,
    // Current size of the partial blob in bytes.
    pub size: u64,
}
/// Lists every in-progress `*.chunked.part` blob in the hub cache,
/// optionally restricted to a single repo id via `repo_filter`.
///
/// # Errors
///
/// Returns [`FetchError::Io`] when the cache directory exists but cannot be
/// read. A missing cache directory yields an empty list.
pub fn find_partial_files(repo_filter: Option<&str>) -> Result<Vec<PartialFile>, FetchError> {
    let cache_dir = hf_cache_dir()?;
    if !cache_dir.exists() {
        return Ok(Vec::new());
    }
    let dir_iter = std::fs::read_dir(&cache_dir).map_err(|e| FetchError::Io {
        path: cache_dir.clone(),
        source: e,
    })?;
    let mut partials = Vec::new();
    for entry in dir_iter.flatten() {
        let folder = entry.file_name();
        let Some(repo_id) = repo_id_from_folder_name(&folder.to_string_lossy()) else {
            continue;
        };
        if repo_filter.is_some_and(|filter| filter != repo_id) {
            continue;
        }
        let blobs_dir = entry.path().join("blobs");
        let Ok(blob_entries) = std::fs::read_dir(&blobs_dir) else {
            continue;
        };
        for blob in blob_entries.flatten() {
            let blob_name = blob.file_name().to_string_lossy().into_owned();
            if !blob_name.ends_with(".chunked.part") {
                continue;
            }
            partials.push(PartialFile {
                repo_id: repo_id.clone(),
                filename: blob_name,
                path: blob.path(),
                size: blob.metadata().map(|m| m.len()).unwrap_or(0),
            });
        }
    }
    Ok(partials)
}
/// A single cached snapshot file and its on-disk size.
#[derive(Debug, Clone)]
pub struct CacheFileUsage {
    // Path of the file relative to the snapshot root, `/`-joined.
    pub filename: String,
    // File size in bytes (0 when metadata is unreadable).
    pub size: u64,
}
pub fn cache_repo_usage(repo_id: &str) -> Result<Vec<CacheFileUsage>, FetchError> {
let cache_dir = hf_cache_dir()?;
let repo_folder = crate::chunked::repo_folder_name(repo_id);
let repo_dir = cache_dir.join(repo_folder.as_str());
if !repo_dir.exists() {
return Ok(Vec::new());
}
let snapshots_dir = repo_dir.join("snapshots");
let Ok(snapshots) = std::fs::read_dir(&snapshots_dir) else {
return Ok(Vec::new());
};
let mut files: Vec<CacheFileUsage> = Vec::new();
for snap_entry in snapshots {
let Ok(snap_entry) = snap_entry else { continue };
let snap_path = snap_entry.path();
if !snap_path.is_dir() {
continue;
}
collect_snapshot_files(&snap_path, "", &mut files);
}
files.sort_by(|a, b| b.size.cmp(&a.size));
Ok(files)
}
/// Walks `dir` depth-first, pushing each file's `/`-joined relative path
/// (rooted at the initial call's `prefix`) and size into `files`.
/// Unreadable directories are skipped silently.
fn collect_snapshot_files(dir: &Path, prefix: &str, files: &mut Vec<CacheFileUsage>) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };
    for entry in entries.flatten() {
        let name = entry.file_name().to_string_lossy().into_owned();
        let rel_path = if prefix.is_empty() {
            name
        } else {
            format!("{prefix}/{name}")
        };
        let path = entry.path();
        if path.is_dir() {
            collect_snapshot_files(&path, &rel_path, files);
        } else {
            files.push(CacheFileUsage {
                filename: rel_path,
                size: entry.metadata().map_or(0, |m| m.len()),
            });
        }
    }
}