use serde::Serialize;
use std::fmt;
/// Coarse grouping of [`Category`] values into two tiers.
///
/// The assignment of each category to a tier lives in [`Category::tier`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Tier {
/// Tier of `Cache`, `Build`, `Log`, `Media`, `Vcs`, `Ide` and `Other`.
Core,
/// Tier of `Archive`, `Installer`, `VmImage`, `ModelCache` and `Backup`.
Extended,
}
/// Classification bucket assigned to a directory or file name by
/// [`explain_dir`] / [`explain_file`].
///
/// Both serde and clap are told to use snake_case variant names (see the
/// `rename_all` attributes below); [`Category::label`] spells the names the
/// same way.
//
// NOTE: the variants below carry plain `//` comments on purpose. clap's
// `ValueEnum` derive turns `///` doc comments on variants into help text for
// the corresponding possible value, which would change CLI output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, clap::ValueEnum)]
#[serde(rename_all = "snake_case")]
#[clap(rename_all = "snake_case")]
pub enum Category {
// Directories listed in `CACHE_DIRS`, or whose name contains "cache".
Cache,
// Directories listed in `BUILD_DIRS`.
Build,
// Directories listed in `LOG_DIRS`, and files ending in ".log".
Log,
// Files whose name ends with an entry of `MEDIA_EXTENSIONS`.
Media,
// Directories listed in `VCS_DIRS`.
Vcs,
// Directories listed in `IDE_DIRS`.
Ide,
// Fallback when no rule matches.
Other,
// Files whose name ends with an entry of `ARCHIVE_EXTENSIONS`.
Archive,
// Files whose name ends with an entry of `INSTALLER_EXTENSIONS`.
Installer,
// Files matching `VM_IMAGE_FILE_SUFFIXES` or `VM_IMAGE_EXTENSIONS`.
VmImage,
// Directories listed in `MODEL_CACHE_DIRS`.
ModelCache,
// Directories listed in `BACKUP_DIRS`, and files matching `BACKUP_EXTENSIONS`.
Backup,
}
impl Category {
    /// Returns the fixed snake_case name of this category.
    ///
    /// The `Display` implementation for `Category` prints this same string.
    pub fn label(&self) -> &'static str {
        match *self {
            Self::Cache => "cache",
            Self::Build => "build",
            Self::Log => "log",
            Self::Media => "media",
            Self::Vcs => "vcs",
            Self::Ide => "ide",
            Self::Other => "other",
            Self::Archive => "archive",
            Self::Installer => "installer",
            Self::VmImage => "vm_image",
            Self::ModelCache => "model_cache",
            Self::Backup => "backup",
        }
    }

    /// Returns the [`Tier`] this category belongs to.
    ///
    /// The match is kept exhaustive (no `_` arm) so that adding a variant
    /// forces an explicit tier decision at compile time.
    pub fn tier(&self) -> Tier {
        match *self {
            Self::Archive
            | Self::Installer
            | Self::VmImage
            | Self::ModelCache
            | Self::Backup => Tier::Extended,
            Self::Cache
            | Self::Build
            | Self::Log
            | Self::Media
            | Self::Vcs
            | Self::Ide
            | Self::Other => Tier::Core,
        }
    }
}
impl fmt::Display for Category {
    /// Writes [`Category::label`], honoring the caller's format spec.
    ///
    /// Width, fill and alignment (e.g. `{:<12}` when printing aligned
    /// columns) are applied the same way they are for a plain `&str`;
    /// a precision such as `{:.3}` truncates the label.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `write!(f, "{}", ..)` starts a fresh, flag-less format spec and so
        // silently drops any width/alignment the caller asked for.
        // `Formatter::pad` emits the string with those flags applied.
        f.pad(self.label())
    }
}
/// Directory names classified as [`Category::ModelCache`] by exact match.
///
/// `explain_dir` consults this table first, before `CACHE_DIRS` and the
/// generic "contains cache" rule. Entries must be lowercase because the
/// directory name is lowercased before comparison.
const MODEL_CACHE_DIRS: &[&str] = &[".ollama", ".lmstudio", ".huggingface"];
/// Directory names classified as [`Category::Backup`] by exact match.
///
/// Entries are lowercase; `explain_dir` lowercases the directory name before
/// comparing, so e.g. "Time Machine Backups" matches.
const BACKUP_DIRS: &[&str] = &["time machine backups", "backups.backupdb"];
/// Directory names classified as [`Category::Cache`] by exact match.
///
/// `explain_dir` checks this table after `MODEL_CACHE_DIRS` and `BACKUP_DIRS`.
/// Names not listed here still classify as cache when they contain the
/// substring "cache". Entries must be lowercase because the directory name is
/// lowercased before comparison.
const CACHE_DIRS: &[&str] = &[
"node_modules",
".cache",
"__pycache__",
".npm",
".yarn",
".pnpm-store",
"caches",
".gradle",
".nuget",
".pub-cache",
"pods",
".cocoapods",
".cargo",
"bower_components",
".tmp",
"tmp",
"temp",
".temp",
".trash",
".rustup",
".pyenv",
".rbenv",
".nvm",
".volta",
".asdf",
"mise",
".pipx",
"pipx",
".poetry",
".composer",
".m2",
".ivy2",
".sbt",
".stack",
".cabal",
".deno",
".bun",
".docker",
"vm_bundles",
];
/// Directory names classified as [`Category::Build`] by exact match.
///
/// Checked by `explain_dir` only after every cache rule (including the
/// "contains cache" substring rule) has failed. Entries must be lowercase.
const BUILD_DIRS: &[&str] = &[
"target",
"dist",
"build",
"out",
".next",
".nuxt",
".output",
".turbo",
".angular",
"_build",
"cmake-build-debug",
"cmake-build-release",
];
/// Directory names classified as [`Category::Log`] by exact match (lowercase).
const LOG_DIRS: &[&str] = &["logs", "log", ".logs"];
/// Directory names classified as [`Category::Vcs`] by exact match (lowercase).
const VCS_DIRS: &[&str] = &[".git", ".svn", ".hg", ".jj", ".bzr", "_darcs", ".fossil"];
/// Directory names classified as [`Category::Ide`] by exact match (lowercase).
///
/// This is the last table `explain_dir` consults before falling back to
/// [`Category::Other`].
const IDE_DIRS: &[&str] = &[
".idea",
".vscode",
".vscode-insiders",
".vscode-server",
".vs",
".eclipse",
".settings",
".cursor",
".cursor-server",
".windsurf",
".zed",
".fleet",
];
/// File-name suffixes (not bare extensions) classified as
/// [`Category::VmImage`], reported as `ClassificationReason::FileNameSuffix`.
///
/// `explain_file` checks this before `MEDIA_EXTENSIONS`, so "data.img.raw"
/// is a VM image even though ".raw" alone is a media extension.
// NOTE(review): the test name in this file associates "data.img.raw" with
// OrbStack; confirm that is the intended target.
const VM_IMAGE_FILE_SUFFIXES: &[&str] = &["data.img.raw"];
/// File extensions (lowercase, with leading dot) classified as
/// [`Category::VmImage`]. Matched with `ends_with` against the lowercased
/// file name.
const VM_IMAGE_EXTENSIONS: &[&str] = &[
".vdi", ".vmdk", ".qcow2", ".vhd", ".vhdx", ".iso",
];
/// File extensions (lowercase, with leading dot) classified as
/// [`Category::Installer`]. The file name is lowercased before matching, so
/// "MyApp.AppImage" matches ".appimage".
const INSTALLER_EXTENSIONS: &[&str] = &[
".dmg", ".pkg", ".msi", ".exe", ".deb", ".rpm", ".appimage", ".snap", ".flatpak", ".apk", ];
/// File extensions (lowercase, with leading dot) classified as
/// [`Category::Archive`].
///
/// Only the final extension is matched, so a compound name such as
/// "source.tar.gz" is reported with the needle ".gz".
const ARCHIVE_EXTENSIONS: &[&str] = &[
".zip", ".tar", ".tgz", ".tbz2", ".txz", ".gz", ".bz2", ".xz", ".7z", ".rar", ".zst",
];
/// File extensions (lowercase, with leading dot) classified as
/// [`Category::Backup`].
const BACKUP_EXTENSIONS: &[&str] = &[".bak", ".backup", ".old"];
/// File extensions (lowercase, with leading dot) classified as
/// [`Category::Media`].
///
/// This is the last table `explain_file` consults, so every other file rule
/// takes precedence over it (e.g. "data.img.raw" is a VM image, not media).
// `rustfmt::skip` keeps the manual one-group-per-line layout below.
#[rustfmt::skip]
const MEDIA_EXTENSIONS: &[&str] = &[
".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp", ".ico", ".tiff", ".heic", ".heif",
".psd", ".raw", ".arw", ".cr2", ".nef", ".dng",
".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".m4v", ".3gp",
".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".m4a", ".opus", ".aiff",
];
/// Why a name was assigned its [`Category`].
///
/// Per the serde attributes below, values serialize as an internally tagged
/// object whose `kind` field holds the snake_case variant name.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum ClassificationReason {
/// The lowercased directory name equals `needle`, an entry of one of the
/// `*_DIRS` tables.
DirNameExact { needle: &'static str },
/// The lowercased directory name contains the substring "cache".
DirNameContainsCache,
/// The lowercased file name ends with `needle`, an entry of
/// `VM_IMAGE_FILE_SUFFIXES`.
FileNameSuffix { needle: &'static str },
/// The lowercased file name ends with the extension `needle`.
FileExtension { needle: &'static str },
/// No rule matched; the category is [`Category::Other`].
Default,
}
/// Result of classifying a name: the chosen category plus the rule that
/// produced it. Returned by [`explain_dir`] and [`explain_file`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Classification {
/// Category assigned to the name.
pub category: Category,
/// Rule that selected `category`.
pub reason: ClassificationReason,
}
impl ClassificationReason {
pub fn describe(&self) -> String {
match self {
ClassificationReason::DirNameExact { needle } => {
format!("matched directory rule: {needle}")
}
ClassificationReason::DirNameContainsCache => {
"directory name contains \"cache\"".to_string()
}
ClassificationReason::FileNameSuffix { needle } => {
format!("matched filename suffix: {needle}")
}
ClassificationReason::FileExtension { needle } => {
format!("matched file extension: {needle}")
}
ClassificationReason::Default => "no rule matched; defaulted to other".to_string(),
}
}
}
pub fn explain_dir(name: &str) -> Classification {
let lower = name.to_lowercase();
if let Some(needle) = first_exact_match(&lower, MODEL_CACHE_DIRS) {
return Classification {
category: Category::ModelCache,
reason: ClassificationReason::DirNameExact { needle },
};
}
if let Some(needle) = first_exact_match(&lower, BACKUP_DIRS) {
return Classification {
category: Category::Backup,
reason: ClassificationReason::DirNameExact { needle },
};
}
if let Some(needle) = first_exact_match(&lower, CACHE_DIRS) {
return Classification {
category: Category::Cache,
reason: ClassificationReason::DirNameExact { needle },
};
}
if lower.contains("cache") {
return Classification {
category: Category::Cache,
reason: ClassificationReason::DirNameContainsCache,
};
}
if let Some(needle) = first_exact_match(&lower, BUILD_DIRS) {
return Classification {
category: Category::Build,
reason: ClassificationReason::DirNameExact { needle },
};
}
if let Some(needle) = first_exact_match(&lower, LOG_DIRS) {
return Classification {
category: Category::Log,
reason: ClassificationReason::DirNameExact { needle },
};
}
if let Some(needle) = first_exact_match(&lower, VCS_DIRS) {
return Classification {
category: Category::Vcs,
reason: ClassificationReason::DirNameExact { needle },
};
}
if let Some(needle) = first_exact_match(&lower, IDE_DIRS) {
return Classification {
category: Category::Ide,
reason: ClassificationReason::DirNameExact { needle },
};
}
Classification {
category: Category::Other,
reason: ClassificationReason::Default,
}
}
pub fn explain_file(name: &str) -> Classification {
let lower = name.to_lowercase();
if let Some(needle) = first_suffix_match(&lower, &[".log"]) {
return Classification {
category: Category::Log,
reason: ClassificationReason::FileExtension { needle },
};
}
if let Some(needle) = first_suffix_match(&lower, VM_IMAGE_FILE_SUFFIXES) {
return Classification {
category: Category::VmImage,
reason: ClassificationReason::FileNameSuffix { needle },
};
}
if let Some(needle) = first_suffix_match(&lower, VM_IMAGE_EXTENSIONS) {
return Classification {
category: Category::VmImage,
reason: ClassificationReason::FileExtension { needle },
};
}
if let Some(needle) = first_suffix_match(&lower, INSTALLER_EXTENSIONS) {
return Classification {
category: Category::Installer,
reason: ClassificationReason::FileExtension { needle },
};
}
if let Some(needle) = first_suffix_match(&lower, ARCHIVE_EXTENSIONS) {
return Classification {
category: Category::Archive,
reason: ClassificationReason::FileExtension { needle },
};
}
if let Some(needle) = first_suffix_match(&lower, BACKUP_EXTENSIONS) {
return Classification {
category: Category::Backup,
reason: ClassificationReason::FileExtension { needle },
};
}
if let Some(needle) = first_suffix_match(&lower, MEDIA_EXTENSIONS) {
return Classification {
category: Category::Media,
reason: ClassificationReason::FileExtension { needle },
};
}
Classification {
category: Category::Other,
reason: ClassificationReason::Default,
}
}
/// Returns the first entry of `needles` that is exactly equal to `lower`,
/// or `None` when no entry matches.
///
/// The comparison is case-sensitive; callers pass an already-lowercased name.
fn first_exact_match(lower: &str, needles: &[&'static str]) -> Option<&'static str> {
    for &candidate in needles {
        if candidate == lower {
            return Some(candidate);
        }
    }
    None
}
/// Returns the first entry of `needles` that `lower` ends with, or `None`
/// when no entry is a suffix of it.
///
/// Entries are tried in slice order, so when several would match the earliest
/// one is returned.
fn first_suffix_match(lower: &str, needles: &[&'static str]) -> Option<&'static str> {
    needles
        .iter()
        .find(|suffix| lower.ends_with(**suffix))
        .copied()
}
pub fn classify_dir(name: &str) -> Category {
explain_dir(name).category
}
pub fn classify_file(name: &str) -> Category {
explain_file(name).category
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that each `(directory name, expected category)` pair holds.
    fn assert_dirs(cases: &[(&str, Category)]) {
        for &(name, expected) in cases {
            assert_eq!(classify_dir(name), expected, "directory {name:?}");
        }
    }

    /// Asserts that each `(file name, expected category)` pair holds.
    fn assert_files(cases: &[(&str, Category)]) {
        for &(name, expected) in cases {
            assert_eq!(classify_file(name), expected, "file {name:?}");
        }
    }

    #[test]
    fn classifies_known_directories() {
        assert_dirs(&[
            ("node_modules", Category::Cache),
            ("__pycache__", Category::Cache),
            ("target", Category::Build),
            ("dist", Category::Build),
            ("logs", Category::Log),
            (".git", Category::Vcs),
            (".idea", Category::Ide),
            ("src", Category::Other),
        ]);
    }

    #[test]
    fn classifies_directory_names_case_insensitively() {
        assert_dirs(&[
            ("Node_Modules", Category::Cache),
            (".GIT", Category::Vcs),
        ]);
    }

    #[test]
    fn classifies_language_toolchains_as_cache() {
        assert_dirs(&[
            (".rustup", Category::Cache),
            (".pyenv", Category::Cache),
            (".nvm", Category::Cache),
            ("mise", Category::Cache),
            ("pipx", Category::Cache),
            (".docker", Category::Cache),
            ("vm_bundles", Category::Cache),
        ]);
    }

    #[test]
    fn classifies_additional_ide_and_vcs() {
        assert_dirs(&[
            (".vscode-insiders", Category::Ide),
            (".cursor", Category::Ide),
            (".zed", Category::Ide),
            (".jj", Category::Vcs),
        ]);
    }

    #[test]
    fn partial_match_catches_cache_directories() {
        assert_dirs(&[
            ("GPUCache", Category::Cache),
            ("Code Cache", Category::Cache),
        ]);
    }

    #[test]
    fn classifies_files_by_extension() {
        assert_files(&[
            ("debug.log", Category::Log),
            ("photo.JPG", Category::Media),
            ("video.mp4", Category::Media),
            ("song.mp3", Category::Media),
            ("main.rs", Category::Other),
        ]);
    }

    #[test]
    fn typescript_files_are_not_media() {
        assert_files(&[
            ("index.ts", Category::Other),
            ("App.tsx", Category::Other),
            ("eleventy.config.ts", Category::Other),
        ]);
    }

    #[test]
    fn ai_model_stores_classify_as_model_cache() {
        assert_dirs(&[
            (".ollama", Category::ModelCache),
            (".lmstudio", Category::ModelCache),
            (".huggingface", Category::ModelCache),
        ]);
    }

    #[test]
    fn time_machine_backup_directories_classify_as_backup() {
        assert_dirs(&[
            ("Time Machine Backups", Category::Backup),
            ("Backups.backupdb", Category::Backup),
        ]);
    }

    #[test]
    fn installer_files_classify_as_installer() {
        assert_files(&[
            ("Codex.dmg", Category::Installer),
            ("googlechrome.dmg", Category::Installer),
            ("setup.exe", Category::Installer),
            ("package.deb", Category::Installer),
            ("MyApp.AppImage", Category::Installer),
        ]);
    }

    #[test]
    fn vm_images_classify_as_vm_image() {
        assert_files(&[
            ("disk.vdi", Category::VmImage),
            ("disk.vmdk", Category::VmImage),
            ("disk.qcow2", Category::VmImage),
            ("data.img.raw", Category::VmImage),
        ]);
    }

    #[test]
    fn raw_photo_classifies_as_media() {
        assert_files(&[
            ("DSC0001.raw", Category::Media),
            ("DSC0001.arw", Category::Media),
            // The whole-name suffix rule must still beat the `.raw` extension.
            ("data.img.raw", Category::VmImage),
        ]);
    }

    #[test]
    fn archive_files_classify_as_archive() {
        assert_files(&[
            ("snapshot.zip", Category::Archive),
            ("source.tar.gz", Category::Archive),
            ("source.tgz", Category::Archive),
            ("blob.7z", Category::Archive),
            ("data.zst", Category::Archive),
        ]);
    }

    #[test]
    fn backup_files_classify_as_backup() {
        assert_files(&[
            ("config.bak", Category::Backup),
            ("notes.old", Category::Backup),
        ]);
    }

    #[test]
    fn tier_split_matches_intent() {
        let core = [
            Category::Cache,
            Category::Build,
            Category::Log,
            Category::Media,
            Category::Vcs,
            Category::Ide,
            Category::Other,
        ];
        let extended = [
            Category::Archive,
            Category::Installer,
            Category::VmImage,
            Category::ModelCache,
            Category::Backup,
        ];
        for c in core {
            assert_eq!(c.tier(), Tier::Core, "{c:?} should be Core");
        }
        for c in extended {
            assert_eq!(c.tier(), Tier::Extended, "{c:?} should be Extended");
        }
    }

    #[test]
    fn explain_dir_reports_exact_match_needle() {
        assert_eq!(
            explain_dir("node_modules"),
            Classification {
                category: Category::Cache,
                reason: ClassificationReason::DirNameExact {
                    needle: "node_modules"
                },
            }
        );
    }

    #[test]
    fn explain_dir_reports_extended_winning_over_cache() {
        assert_eq!(
            explain_dir(".ollama"),
            Classification {
                category: Category::ModelCache,
                reason: ClassificationReason::DirNameExact { needle: ".ollama" },
            }
        );
    }

    #[test]
    fn explain_dir_reports_partial_cache_match() {
        assert_eq!(
            explain_dir("GPUCache"),
            Classification {
                category: Category::Cache,
                reason: ClassificationReason::DirNameContainsCache,
            }
        );
    }

    #[test]
    fn explain_dir_reports_default_when_unmatched() {
        assert_eq!(
            explain_dir("src"),
            Classification {
                category: Category::Other,
                reason: ClassificationReason::Default,
            }
        );
    }

    #[test]
    fn explain_file_reports_extension_needle() {
        assert_eq!(
            explain_file("debug.log"),
            Classification {
                category: Category::Log,
                reason: ClassificationReason::FileExtension { needle: ".log" },
            }
        );
        // Compound extensions report only the final component.
        assert_eq!(
            explain_file("source.tar.gz"),
            Classification {
                category: Category::Archive,
                reason: ClassificationReason::FileExtension { needle: ".gz" },
            }
        );
    }

    #[test]
    fn explain_file_reports_filename_suffix_for_orbstack() {
        assert_eq!(
            explain_file("data.img.raw"),
            Classification {
                category: Category::VmImage,
                reason: ClassificationReason::FileNameSuffix {
                    needle: "data.img.raw"
                },
            }
        );
    }

    #[test]
    fn explain_file_reports_default_when_unmatched() {
        assert_eq!(
            explain_file("main.rs"),
            Classification {
                category: Category::Other,
                reason: ClassificationReason::Default,
            }
        );
    }
}