use super::{CodeStatistics, LanguageStats};
use anyhow::Result;
use std::collections::HashMap;
use std::path::Path;
use tokio::fs;
use tracing::debug;
use walkdir::WalkDir;
/// Stateless analyzer that walks a directory tree and aggregates
/// per-language file and line statistics; see `analyze_directory`.
pub struct CodeAnalyzer;
impl CodeAnalyzer {
/// Creates a new `CodeAnalyzer`. The type carries no state, so
/// construction is free.
pub fn new() -> Self {
Self
}
/// Recursively scans `path` (max depth 10, symlinks not followed) and
/// aggregates code statistics: file/line totals, a per-language breakdown,
/// an extension histogram, the mean file size in lines, and the twenty
/// largest files ranked by total line count.
///
/// Directory pruning is delegated to `should_skip` and per-file filtering
/// to `is_binary_or_generated`; walk errors are silently dropped and
/// per-file analysis failures are logged at debug level only.
pub async fn analyze_directory(&self, path: &Path) -> Result<CodeStatistics> {
    let mut stats = CodeStatistics::default();
    let mut per_language: HashMap<String, LanguageStats> = HashMap::new();
    let mut line_counts: Vec<(String, usize)> = Vec::new();

    let walker = WalkDir::new(path)
        .follow_links(false)
        .max_depth(10)
        .into_iter()
        .filter_entry(|e| !self.should_skip(e.path()));

    // Entries that error out (permissions, races) are simply dropped.
    for entry in walker.filter_map(|res| res.ok()) {
        if !entry.file_type().is_file() {
            continue;
        }
        let file_path = entry.path();
        if self.is_binary_or_generated(file_path) {
            continue;
        }
        // Report paths relative to the scanned root where possible.
        let display_path = file_path
            .strip_prefix(path)
            .unwrap_or(file_path)
            .to_string_lossy()
            .to_string();
        let language = self.detect_language(file_path);
        match self.count_lines(file_path).await {
            Ok((total, code, comments, blank)) => {
                stats.total_files += 1;
                stats.total_lines += total;
                let bucket = per_language
                    .entry(language)
                    .or_insert_with(|| LanguageStats {
                        file_count: 0,
                        total_lines: 0,
                        code_lines: 0,
                        comment_lines: 0,
                        blank_lines: 0,
                    });
                bucket.file_count += 1;
                bucket.total_lines += total;
                bucket.code_lines += code;
                bucket.comment_lines += comments;
                bucket.blank_lines += blank;
                line_counts.push((display_path, total));
            }
            Err(e) => {
                debug!("Failed to analyze {}: {}", file_path.display(), e);
            }
        }
        // The extension histogram also counts files whose line count failed.
        if let Some(ext) = file_path.extension() {
            *stats
                .file_types
                .entry(ext.to_string_lossy().to_lowercase())
                .or_insert(0) += 1;
        }
    }

    stats.languages = per_language;
    if stats.total_files > 0 {
        // Integer division: fractional lines are truncated.
        stats.average_file_size = stats.total_lines / stats.total_files;
    }
    // Largest first, keep the top twenty.
    line_counts.sort_by(|a, b| b.1.cmp(&a.1));
    stats.largest_files = line_counts.into_iter().take(20).collect();
    Ok(stats)
}
/// Returns `true` when the final component of `path` names an entry that
/// should be pruned from the walk: dependency caches, build output, and
/// VCS/IDE metadata directories.
///
/// Used as `filter_entry(|e| !self.should_skip(e.path()))`, so `true`
/// removes the entry — and, for directories, its whole subtree — from
/// traversal.
///
/// BUG FIX: the original returned `false` on a match and `true` otherwise;
/// through the negation at the call site that pruned every ordinary
/// directory and traversed only the junk ones.
fn should_skip(&self, path: &Path) -> bool {
    // NOTE(review): matching is by component name only, so a plain FILE
    // named e.g. "out" or "bin" is pruned too — confirm this is intended.
    const SKIP_DIRS: &[&str] = &[
        "node_modules", "target", "build", "dist", ".git",
        ".github", ".vscode", "vendor", "__pycache__",
        ".pytest_cache", ".next", "out", "coverage",
        ".idea", ".vs", "bin", "obj", "Debug", "Release",
    ];
    path.file_name()
        .map(|file_name| {
            let name = file_name.to_string_lossy();
            SKIP_DIRS.iter().any(|&d| name == d)
        })
        .unwrap_or(false)
}
/// Heuristically decides whether `path` points at a binary asset or a
/// generated/minified file that should be excluded from line counting.
fn is_binary_or_generated(&self, path: &Path) -> bool {
    // NOTE: `Path::extension` yields only the text after the LAST dot, so
    // compound extensions such as "min.js"/"min.css" (present in the
    // original list) could never match here and have been removed as dead
    // entries; minified files are caught by the ".min." check below.
    const BINARY_EXTS: &[&str] = &[
        "exe", "dll", "so", "dylib", "bin", "o", "a", "lib",
        "png", "jpg", "jpeg", "gif", "svg", "ico", "woff", "woff2",
        "ttf", "eot", "pdf", "zip", "tar", "gz", "rar", "7z",
        "mp3", "mp4", "avi", "mov", "webm", "wasm",
        "lock", "sum",
    ];
    if let Some(ext) = path.extension() {
        let ext = ext.to_string_lossy().to_lowercase();
        if BINARY_EXTS.contains(&ext.as_str()) {
            return true;
        }
    }
    let file_name = path
        .file_name()
        .map(|n| n.to_string_lossy().to_lowercase())
        .unwrap_or_default();
    // Common naming conventions for minified or machine-generated files.
    file_name.contains(".min.")
        || file_name.contains(".generated.")
        || file_name.starts_with("generated_")
        || file_name.ends_with("_generated")
}
/// Maps a file path to a human-readable language name based on its
/// (lowercased) extension, falling back to "Other".
///
/// FIX: a file named `Dockerfile` has no extension at all, so the
/// `"dockerfile"` arm in the table below was unreachable for the common
/// case; the filename is now special-cased (case-insensitively). Files
/// previously classified "Other" are the only ones affected.
fn detect_language(&self, path: &Path) -> String {
    if path
        .file_name()
        .map(|n| n.to_string_lossy().eq_ignore_ascii_case("dockerfile"))
        .unwrap_or(false)
    {
        return "Dockerfile".to_string();
    }
    let ext = path.extension()
        .map(|e| e.to_string_lossy().to_lowercase())
        .unwrap_or_default();
    match ext.as_str() {
        "rs" => "Rust",
        "js" => "JavaScript",
        "ts" => "TypeScript",
        "jsx" => "JavaScript (JSX)",
        "tsx" => "TypeScript (TSX)",
        "py" => "Python",
        "java" => "Java",
        "kt" => "Kotlin",
        "go" => "Go",
        "rb" => "Ruby",
        "php" => "PHP",
        "c" => "C",
        "cpp" | "cc" | "cxx" => "C++",
        "h" | "hpp" => "C/C++ Header",
        "cs" => "C#",
        "swift" => "Swift",
        "scala" => "Scala",
        "r" => "R",
        // NOTE(review): ".m" is ambiguous with MATLAB; Objective-C wins here.
        "m" => "Objective-C",
        "sh" | "bash" => "Shell",
        "ps1" => "PowerShell",
        "sql" => "SQL",
        "html" => "HTML",
        "css" => "CSS",
        "scss" | "sass" => "SCSS",
        "less" => "Less",
        "json" => "JSON",
        "xml" => "XML",
        "yaml" | "yml" => "YAML",
        "toml" => "TOML",
        "md" | "markdown" => "Markdown",
        "dockerfile" => "Dockerfile",
        "tf" => "Terraform",
        "hs" => "Haskell",
        "elm" => "Elm",
        "clj" => "Clojure",
        "ex" | "exs" => "Elixir",
        "erl" => "Erlang",
        "lua" => "Lua",
        "vim" => "Vim Script",
        "dart" => "Dart",
        "groovy" => "Groovy",
        "pl" | "pm" => "Perl",
        "rkt" => "Racket",
        "fs" | "fsx" => "F#",
        "ml" | "mli" => "OCaml",
        "pas" => "Pascal",
        "d" => "D",
        "nim" => "Nim",
        "cr" => "Crystal",
        "v" => "V",
        "zig" => "Zig",
        _ => "Other",
    }.to_string()
}
/// Reads `path` as UTF-8 and classifies each line, returning
/// `(total, code, comments, blank)`.
///
/// Classification is line-based and heuristic: a line counts as a comment
/// only when it *starts* with a comment token after trimming, so trailing
/// comments (`code // note`) count as code, and a line that closes a block
/// comment but also carries code still counts entirely as a comment.
///
/// # Errors
/// Propagates the I/O error for unreadable or non-UTF-8 files.
async fn count_lines(&self, path: &Path) -> Result<(usize, usize, usize, usize)> {
let content = fs::read_to_string(path).await?;
let mut total = 0;
let mut code = 0;
let mut comments = 0;
let mut blank = 0;
// Comment tokens depend on the detected language; any of the three may
// be absent (e.g. Python has no block-comment delimiters here).
let lang = self.detect_language(path);
let (single_comment, multi_start, multi_end) = self.get_comment_syntax(&lang);
// State: true while inside a block comment spanning multiple lines.
let mut in_multiline_comment = false;
for line in content.lines() {
total += 1;
let trimmed = line.trim();
if trimmed.is_empty() {
blank += 1;
continue;
}
// Block comments are checked BEFORE line comments: for languages like
// Lua, the block opener can itself begin with the line-comment token.
if let Some(start) = multi_start {
if let Some(end) = multi_end {
if in_multiline_comment {
comments += 1;
// Leave the block state as soon as the closer appears anywhere
// on the line (any trailing code is still counted as comment).
if trimmed.contains(end) {
in_multiline_comment = false;
}
continue;
} else if trimmed.starts_with(start) {
comments += 1;
// Stay out of block state only when the comment closes on this
// same line; the count comparison catches lines that close one
// block and immediately open another (e.g. `/* a */ /*`).
if !trimmed.contains(end) || trimmed.matches(start).count() > trimmed.matches(end).count() {
in_multiline_comment = true;
}
continue;
}
}
}
if let Some(prefix) = single_comment {
if trimmed.starts_with(prefix) {
comments += 1;
continue;
}
}
// Anything non-blank that is not comment-leading counts as code.
code += 1;
}
Ok((total, code, comments, blank))
}
/// Returns `(line_comment, block_open, block_close)` tokens for a language
/// name as produced by `detect_language`. `(None, None, None)` means the
/// comment syntax is unknown, in which case `count_lines` treats every
/// non-blank line as code.
fn get_comment_syntax(&self, lang: &str) -> (Option<&str>, Option<&str>, Option<&str>) {
    match lang {
        // C family: `//` line comments, `/* ... */` block comments.
        "Rust" | "JavaScript" | "TypeScript" | "Java" | "C" | "C++" | "C/C++ Header"
        | "C#" | "Go" | "Swift" | "Kotlin" | "Scala" | "PHP" | "Dart" | "Groovy" => {
            (Some("//"), Some("/*"), Some("*/"))
        }
        // Hash-comment languages; block forms (if any) are not handled.
        "Python" | "Shell" | "Ruby" | "YAML" | "TOML" | "Perl" | "R" | "PowerShell"
        | "Dockerfile" => (Some("#"), None, None),
        "HTML" | "XML" => (None, Some("<!--"), Some("-->")),
        "CSS" | "SCSS" | "Less" => (None, Some("/*"), Some("*/")),
        // Haskell-style comments; Elm shares the same syntax and was
        // previously missing from this table entirely.
        "Haskell" | "Elm" => (Some("--"), Some("{-"), Some("-}")),
        // BUG FIX: Lua was grouped with Haskell, but Lua block comments are
        // `--[[ ... ]]`, not `{- ... -}`.
        "Lua" => (Some("--"), Some("--[["), Some("]]")),
        "SQL" => (Some("--"), Some("/*"), Some("*/")),
        _ => (None, None, None),
    }
}
}