use crate::{walk_with_options, WalkOptions};
use polars::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
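/// Tuning knobs for outlier detection; [`Default`] gives a 2.0 standard
/// deviation threshold and reports the top 20 outliers.
///
/// A minimal configuration sketch (the 10 MiB floor below is only an
/// illustrative value, not a recommended default):
///
/// ```ignore
/// let options = OutlierOptions {
///     min_size: Some(10 * 1024 * 1024), // only flag files of at least 10 MiB
///     ..Default::default()
/// };
/// ```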
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutlierOptions {
/// Minimum size in bytes for a file to be reported as a large-file outlier
/// (also used as the size floor for clustering when set).
pub min_size: Option<u64>,
/// Maximum number of large-file outliers to report, largest first.
pub top_n: Option<usize>,
/// Number of standard deviations above the mean size at which a file counts as an outlier.
pub std_dev_threshold: f64,
/// Look for well-known space-hungry directories such as node_modules or target.
pub check_hidden_consumers: bool,
/// Include empty directories in the analysis.
pub include_empty_dirs: bool,
/// Group files that share a numbered or dated naming pattern.
pub check_patterns: bool,
/// Compute fuzzy hashes and cluster similar large files.
pub enable_clustering: bool,
/// Minimum ssdeep similarity score for two files to land in the same cluster.
pub cluster_similarity_threshold: u8,
/// Minimum number of files required to form a cluster.
pub min_cluster_size: usize,
}
impl Default for OutlierOptions {
fn default() -> Self {
Self {
min_size: None,
top_n: Some(20),
std_dev_threshold: 2.0,
check_hidden_consumers: true,
include_empty_dirs: false,
check_patterns: true,
enable_clustering: false,
cluster_similarity_threshold: 70,
min_cluster_size: 2,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LargeFileOutlier {
pub path: PathBuf,
pub size_bytes: u64,
pub size_mb: f64,
pub percentage_of_total: f64,
pub std_devs_from_mean: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HiddenConsumer {
pub path: PathBuf,
pub pattern_type: String,
pub total_size_bytes: u64,
pub file_count: usize,
pub recommendation: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PatternGroup {
pub pattern: String,
pub count: usize,
pub total_size_bytes: u64,
pub sample_files: Vec<PathBuf>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutlierReport {
pub large_files: Vec<LargeFileOutlier>,
pub hidden_consumers: Vec<HiddenConsumer>,
pub pattern_groups: Vec<PatternGroup>,
pub large_file_clusters: Vec<crate::clustering::LargeFileCluster>,
pub total_size_analyzed: u64,
pub total_files_analyzed: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SimpleFileInfo {
pub path: PathBuf,
pub size_bytes: u64,
pub ssdeep_hash: Option<String>,
}
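// (directory name, human-readable description, cleanup recommendation) triples
// for directories that commonly consume large amounts of disk space.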
const HIDDEN_CONSUMER_PATTERNS: &[(&str, &str, &str)] = &[
("node_modules", "Node.js dependencies", "Consider using npm prune or clearing unused dependencies"),
(".git", "Git repository data", "Run git gc to clean up unnecessary files"),
("target", "Rust build artifacts", "Run cargo clean to remove build artifacts"),
("build", "Build output directory", "Clean build artifacts if not needed"),
("dist", "Distribution files", "Remove old distribution builds"),
(".venv", "Python virtual environment", "Recreate virtual environment if needed"),
("__pycache__", "Python cache files", "Safe to delete, will be regenerated"),
(".cache", "Application cache", "Review and clean old cache files"),
("tmp", "Temporary files", "Clean up old temporary files"),
("logs", "Log files", "Archive or delete old logs"),
];
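/// Walks `path` and builds an [`OutlierReport`]: unusually large files, known
/// space-hungry directories, numbered/dated file groups and, when enabled,
/// clusters of similar large files.
///
/// A usage sketch; the directory below is only a placeholder:
///
/// ```ignore
/// let report = detect_outliers("/some/directory", &OutlierOptions::default())?;
/// for outlier in &report.large_files {
///     println!("{} ({:.1} MB)", outlier.path.display(), outlier.size_mb);
/// }
/// ```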
pub fn detect_outliers(path: &str, options: &OutlierOptions) -> Result<OutlierReport, Box<dyn std::error::Error>> {
let walk_options = WalkOptions::default();
let files = walk_with_options(path, &walk_options)?;
if files.is_empty() {
return Ok(OutlierReport {
large_files: vec![],
hidden_consumers: vec![],
pattern_groups: vec![],
large_file_clusters: vec![],
total_size_analyzed: 0,
total_files_analyzed: 0,
});
}
// Stat each walked path, skipping entries whose metadata cannot be read.
let file_infos: Vec<SimpleFileInfo> = files
.iter()
.filter_map(|path_str| {
let path = Path::new(path_str);
fs::metadata(path).ok().map(|metadata| {
// Fuzzy-hash only files of at least 1 MiB, and only when clustering is
// enabled; the whole file is read into memory to compute the hash.
let ssdeep_hash = if options.enable_clustering && metadata.len() >= 1024 * 1024 {
fs::read(path).ok().and_then(|content| ssdeep::hash(&content).ok())
} else {
None
};
SimpleFileInfo {
path: path.to_path_buf(),
size_bytes: metadata.len(),
ssdeep_hash,
}
})
})
.collect();
let total_size: u64 = file_infos.iter().map(|f| f.size_bytes).sum();
let total_files = file_infos.len();
let large_files = detect_large_file_outliers(&file_infos, total_size, options);
let hidden_consumers = if options.check_hidden_consumers {
detect_hidden_consumers(&files, &file_infos)
} else {
vec![]
};
let pattern_groups = if options.check_patterns {
detect_pattern_groups(&file_infos)
} else {
vec![]
};
let large_file_clusters = if options.enable_clustering {
// Cluster only files that were fuzzy-hashed and meet the size floor
// (min_size if set, otherwise 1 MiB). Clustering errors are swallowed
// and yield an empty result rather than failing the whole report.
let large_files_for_clustering: Vec<SimpleFileInfo> = file_infos
.iter()
.filter(|f| f.ssdeep_hash.is_some() && f.size_bytes >= options.min_size.unwrap_or(1024 * 1024))
.cloned()
.collect();
crate::clustering::detect_large_file_clusters(
&large_files_for_clustering,
options.cluster_similarity_threshold,
options.min_cluster_size,
)
.unwrap_or_else(|_| vec![])
} else {
vec![]
};
Ok(OutlierReport {
large_files,
hidden_consumers,
pattern_groups,
large_file_clusters,
total_size_analyzed: total_size,
total_files_analyzed: total_files,
})
}
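/// Flags files whose size z-score ((size - mean) / population std dev) exceeds
/// `options.std_dev_threshold`, optionally filtered by `options.min_size` and
/// truncated to `options.top_n`, sorted largest first.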
fn detect_large_file_outliers(
files: &[SimpleFileInfo],
total_size: u64,
options: &OutlierOptions,
) -> Vec<LargeFileOutlier> {
if files.is_empty() {
return vec![];
}
let sizes: Vec<f64> = files.iter().map(|f| f.size_bytes as f64).collect();
let mean = sizes.iter().sum::<f64>() / sizes.len() as f64;
let variance = sizes.iter()
.map(|size| {
let diff = size - mean;
diff * diff
})
.sum::<f64>() / sizes.len() as f64;
let std_dev = variance.sqrt();
let mut outliers: Vec<LargeFileOutlier> = files
.iter()
.filter_map(|f| {
if let Some(min_size) = options.min_size {
if f.size_bytes < min_size {
return None;
}
}
let z_score = if std_dev > 0.0 {
(f.size_bytes as f64 - mean) / std_dev
} else {
0.0
};
if z_score > options.std_dev_threshold {
let outlier = LargeFileOutlier {
path: f.path.clone(),
size_bytes: f.size_bytes,
size_mb: f.size_bytes as f64 / (1024.0 * 1024.0),
percentage_of_total: (f.size_bytes as f64 / total_size as f64) * 100.0,
std_devs_from_mean: z_score,
};
Some(outlier)
} else {
None
}
})
.collect();
outliers.sort_by(|a, b| b.size_bytes.cmp(&a.size_bytes));
if let Some(top_n) = options.top_n {
outliers.truncate(top_n);
}
outliers
}
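/// Matches directory names against [`HIDDEN_CONSUMER_PATTERNS`]. Note that the
/// reported size only sums files directly inside the matched directory, not
/// files in its subdirectories, since paths are grouped by immediate parent.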
fn detect_hidden_consumers(paths: &[String], file_infos: &[SimpleFileInfo]) -> Vec<HiddenConsumer> {
let mut consumers = Vec::new();
let mut path_to_info: HashMap<&Path, &SimpleFileInfo> = HashMap::new();
for info in file_infos {
path_to_info.insert(&info.path, info);
}
let mut dir_contents: HashMap<PathBuf, Vec<&str>> = HashMap::new();
for path in paths {
if let Some(parent) = Path::new(path).parent() {
dir_contents.entry(parent.to_path_buf())
.or_default()
.push(path);
}
}
for (dir, contents) in dir_contents {
for &(pattern, description, recommendation) in HIDDEN_CONSUMER_PATTERNS {
let dir_name = dir.file_name()
.and_then(|n| n.to_str())
.unwrap_or("");
if dir_name == pattern || dir.ends_with(pattern) {
let mut total_size = 0u64;
let mut file_count = 0;
for path_str in &contents {
let path = Path::new(path_str);
if path.starts_with(&dir) {
if let Some(info) = path_to_info.get(path) {
total_size += info.size_bytes;
file_count += 1;
}
}
}
if total_size > 0 {
consumers.push(HiddenConsumer {
path: dir.clone(),
pattern_type: description.to_string(),
total_size_bytes: total_size,
file_count,
recommendation: recommendation.to_string(),
});
}
break;
}
}
}
consumers.sort_by(|a, b| b.total_size_bytes.cmp(&a.total_size_bytes));
consumers
}
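/// Groups files whose names follow a numbered (e.g. "backup-001.tar") or dated
/// (e.g. "log-2024-01-01.txt") naming scheme; only patterns with at least
/// three matching files are reported.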
fn detect_pattern_groups(files: &[SimpleFileInfo]) -> Vec<PatternGroup> {
let mut pattern_map: HashMap<String, Vec<&SimpleFileInfo>> = HashMap::new();
for file in files {
if let Some(file_name) = file.path.file_name() {
let file_name_str = file_name.to_string_lossy();
// Check the dated pattern first: the numbered-pattern regex also matches the
// trailing digits of a date and would otherwise shadow date-based grouping.
if let Some((prefix, suffix)) = detect_dated_pattern(&file_name_str) {
let pattern = format!("{}*{}", prefix, suffix);
pattern_map.entry(pattern).or_default().push(file);
} else if let Some((prefix, suffix)) = detect_numbered_pattern(&file_name_str) {
let pattern = format!("{}*{}", prefix, suffix);
pattern_map.entry(pattern).or_default().push(file);
}
}
}
let mut groups: Vec<PatternGroup> = pattern_map
.into_iter()
// Only report patterns with at least three matching files.
.filter(|(_, files)| files.len() >= 3)
.map(|(pattern, files)| {
let total_size: u64 = files.iter().map(|f| f.size_bytes).sum();
let sample_files: Vec<PathBuf> = files.iter()
.take(5)
.map(|f| f.path.clone())
.collect();
PatternGroup {
pattern,
count: files.len(),
total_size_bytes: total_size,
sample_files,
}
})
.collect();
groups.sort_by(|a, b| b.total_size_bytes.cmp(&a.total_size_bytes));
groups
}
fn detect_numbered_pattern(filename: &str) -> Option<(&str, &str)> {
// Matches names like "backup-001.tar": a prefix, an optional -/_ separator,
// a run of two or more digits, and an optional extension.
let re = regex::Regex::new(r"^(.+?)[-_]?(\d{2,})(\..+)?$").ok()?;
let captures = re.captures(filename)?;
let prefix = captures.get(1)?.as_str();
let suffix = captures.get(3).map_or("", |m| m.as_str());
Some((prefix, suffix))
}
fn detect_dated_pattern(filename: &str) -> Option<(&str, &str)> {
// Matches names like "log-2024-01-01.txt": a prefix, an optional -/_ separator,
// a YYYY-MM-DD style date (with -, _, or no separators), and an optional extension.
let re = regex::Regex::new(r"^(.+?)[-_]?(\d{4}[-_]?\d{2}[-_]?\d{2})(\..+)?$").ok()?;
let captures = re.captures(filename)?;
let prefix = captures.get(1)?.as_str();
let suffix = captures.get(3).map_or("", |m| m.as_str());
Some((prefix, suffix))
}
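/// Converts the large-file outliers of a report into a Polars `DataFrame`
/// with the columns `file_path`, `size_mb`, `percentage_of_total`, and
/// `std_devs_from_mean`.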
pub fn outliers_to_dataframe(report: &OutlierReport) -> Result<DataFrame, PolarsError> {
let file_paths: Vec<String> = report.large_files.iter()
.map(|f| f.path.to_string_lossy().to_string())
.collect();
let size_mb: Vec<f64> = report.large_files.iter()
.map(|f| f.size_mb)
.collect();
let percentage: Vec<f64> = report.large_files.iter()
.map(|f| f.percentage_of_total)
.collect();
let std_devs: Vec<f64> = report.large_files.iter()
.map(|f| f.std_devs_from_mean)
.collect();
let df = DataFrame::new(vec![
Series::new("file_path", file_paths),
Series::new("size_mb", size_mb),
Series::new("percentage_of_total", percentage),
Series::new("std_devs_from_mean", std_devs),
])?;
Ok(df)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_detect_numbered_pattern() {
assert_eq!(detect_numbered_pattern("backup-001.tar"), Some(("backup", ".tar")));
assert_eq!(detect_numbered_pattern("file_123.log"), Some(("file", ".log")));
assert_eq!(detect_numbered_pattern("test123"), Some(("test", "")));
assert_eq!(detect_numbered_pattern("no-numbers.txt"), None);
}
#[test]
fn test_detect_dated_pattern() {
assert_eq!(detect_dated_pattern("log-2024-01-01.txt"), Some(("log", ".txt")));
assert_eq!(detect_dated_pattern("backup_2024_12_31.tar"), Some(("backup", ".tar")));
assert_eq!(detect_dated_pattern("report-2024-01-01"), Some(("report", "")));
assert_eq!(detect_dated_pattern("no-date.txt"), None);
}
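// Illustrative tests over synthetic in-memory file lists; the sizes and paths
// below are made up purely to exercise the statistics and the grouping logic.
#[test]
fn test_detect_large_file_outliers_flags_large_file() {
// Nine 100-byte files plus one 10,000-byte file: the large file sits exactly
// 3 standard deviations above the mean, well past the default 2.0 threshold.
let mut files: Vec<SimpleFileInfo> = (0..9)
.map(|i| SimpleFileInfo {
path: PathBuf::from(format!("small-{}.txt", i)),
size_bytes: 100,
ssdeep_hash: None,
})
.collect();
files.push(SimpleFileInfo {
path: PathBuf::from("huge.bin"),
size_bytes: 10_000,
ssdeep_hash: None,
});
let total_size: u64 = files.iter().map(|f| f.size_bytes).sum();
let outliers = detect_large_file_outliers(&files, total_size, &OutlierOptions::default());
assert_eq!(outliers.len(), 1);
assert_eq!(outliers[0].path, PathBuf::from("huge.bin"));
assert!((outliers[0].std_devs_from_mean - 3.0).abs() < 1e-9);
}
#[test]
fn test_detect_pattern_groups_groups_numbered_files() {
// Three numbered backups should collapse into a single "backup*.tar" group.
let files: Vec<SimpleFileInfo> = (1..=3)
.map(|i| SimpleFileInfo {
path: PathBuf::from(format!("backup-{:03}.tar", i)),
size_bytes: 1_000,
ssdeep_hash: None,
})
.collect();
let groups = detect_pattern_groups(&files);
assert_eq!(groups.len(), 1);
assert_eq!(groups[0].pattern, "backup*.tar");
assert_eq!(groups[0].count, 3);
assert_eq!(groups[0].total_size_bytes, 3_000);
}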
}