use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::{Context, Result};
use chrono::{DateTime, TimeZone, Utc};
use rayon::prelude::*;
use crate::cli::Args;
use crate::extractors;
use crate::language::{BINARY_EXTENSIONS, EXCLUDED_DIRS};
use crate::locignore::LocIgnore;
use crate::models::{Breakdown, FileInfo, ScanResult};
/// Resolved settings for one scan run.
///
/// Built by [`ScanConfig::from_args`] from the CLI arguments and the global
/// configuration, then consumed by [`run_scan`].
#[derive(Clone)]
pub struct ScanConfig {
/// Directory to scan; `from_args` stores the canonicalized path.
pub target_dir: PathBuf,
/// Extension filter. `None` means every file is considered; otherwise a file
/// is kept only if its lowercased, dot-prefixed extension is in the set.
pub allowed_extensions: Option<HashSet<String>>,
/// Size threshold taken from the CLI, falling back to the global config.
/// NOTE(review): not consumed in this file — unit and meaning to be confirmed
/// against the reporting code.
pub warn_size: Option<usize>,
/// When `true`, each file gets a `last_modified` value (from the git date
/// cache if one is present, otherwise from filesystem metadata).
pub use_git_dates: bool,
/// Allows parallel processing; `run_scan` only uses it for more than 50 files.
pub parallel: bool,
/// When `true`, an extractor (if one exists for the file) is run on text files.
pub extract_functions: bool,
/// Result of the `git rev-parse --is-inside-work-tree` probe on `target_dir`.
pub is_git_repo: bool,
/// Ignore rules built for `target_dir`.
pub locignore: LocIgnore,
/// Include dot-files and dot-directories. Setting this also makes `run_scan`
/// walk the directory manually instead of listing files through git.
pub include_hidden: bool,
/// Per-path commit timestamps. `from_args` leaves this `None`; `run_scan`
/// fills it in on its private copy of the config when git dates are enabled.
pub git_dates_cache: Option<Arc<HashMap<PathBuf, DateTime<Utc>>>>,
}
impl ScanConfig {
    /// Builds a scan configuration from parsed CLI arguments.
    ///
    /// CLI values take precedence; the global configuration only supplies
    /// defaults (file types, warn size, always-extract-functions).
    ///
    /// # Errors
    ///
    /// Fails when the directory cannot be canonicalized or is not a directory.
    pub fn from_args(args: &Args) -> Result<Self> {
        let target_dir = Path::new(&args.directory)
            .canonicalize()
            .with_context(|| format!("Cannot resolve directory: {}", args.directory))?;
        if !target_dir.is_dir() {
            anyhow::bail!("Not a directory: {}", target_dir.display());
        }

        let is_git_repo = check_git_repo(&target_dir);
        let global_config = crate::config::GlobalConfig::load();

        // Language filters from the command line win; the configured defaults
        // are only used when none were given.
        let requested_types = match &global_config.default_types {
            Some(defaults) if args.file_types.is_empty() => defaults.clone(),
            _ => args.file_types.clone(),
        };

        let allowed_extensions = (!requested_types.is_empty()).then(|| {
            let mut extensions = HashSet::new();
            for lang in &requested_types {
                let resolved = crate::language::resolve_extensions(lang);
                // A resolution that is empty, or that merely echoes the input
                // back as ".<input>", is reported as an unknown filter.
                let looks_unknown = resolved.is_empty()
                    || (resolved.len() == 1 && resolved[0] == format!(".{}", lang));
                if looks_unknown {
                    eprintln!("[WARNING] Unknown language filter: {}", lang);
                }
                extensions.extend(resolved);
            }
            extensions
        });

        let locignore = LocIgnore::build(&target_dir);
        let warn_size = args.warn_size.or(global_config.warn_size);
        let extract_functions = args.functions
            || args.func_analysis
            || global_config.always_extract_functions.unwrap_or(false);

        Ok(Self {
            target_dir,
            allowed_extensions,
            warn_size,
            use_git_dates: args.git_dates,
            parallel: !args.no_parallel,
            extract_functions,
            is_git_repo,
            locignore,
            include_hidden: args.include_hidden,
            git_dates_cache: None,
        })
    }
}
/// Scans the configured directory and aggregates the results.
///
/// Files are listed through git when the target is a git repository and hidden
/// files are not requested, otherwise through a manual directory walk. Each file
/// is processed (in parallel when enabled and more than 50 files were found);
/// files that fail are reported on stderr and skipped. The returned breakdown
/// groups non-binary, non-lockfile entries by extension, or by file name when a
/// file has no extension.
pub fn run_scan(config: &ScanConfig) -> Result<ScanResult> {
    let list_via_git = config.is_git_repo && !config.include_hidden;
    let files = if list_via_git {
        get_git_files(&config.target_dir, &config.locignore)
    } else {
        get_manual_files(&config.target_dir, &config.locignore, config.include_hidden)
    };

    let git_dates_cache: Option<Arc<HashMap<PathBuf, DateTime<Utc>>>> =
        (config.use_git_dates && config.is_git_repo)
            .then(|| Arc::new(get_all_git_dates(&config.target_dir)));

    // The caller's config stays untouched; the cache lives on a private copy.
    let mut runner_config = config.clone();
    runner_config.git_dates_cache = git_dates_cache;

    // Shared by the parallel and the sequential path.
    let scan_one = |path: &PathBuf| -> Option<FileInfo> {
        process_file(path, &runner_config).unwrap_or_else(|e| {
            eprintln!("[WARN] Skipped {}: {}", path.display(), e);
            None
        })
    };

    let mut file_infos: Vec<FileInfo> = if runner_config.parallel && files.len() > 50 {
        files.par_iter().filter_map(scan_one).collect()
    } else {
        files.iter().filter_map(scan_one).collect()
    };
    file_infos.sort_by(|a, b| a.path.cmp(&b.path));

    let mut breakdown: Breakdown = HashMap::new();
    let countable = file_infos
        .iter()
        .filter(|fi| !(fi.is_binary || fi.is_lockfile));
    for fi in countable {
        let extension = fi.extension();
        let key = if extension.is_empty() {
            // Extension-less files are grouped under their own file name.
            fi.path
                .file_name()
                .and_then(|n| n.to_str())
                .unwrap_or("(no ext)")
                .to_string()
        } else {
            extension.to_string()
        };

        let stats = breakdown.entry(key).or_default();
        stats.files += 1;
        stats.lines += fi.lines;
        stats.code += fi.code;
        stats.comment += fi.comment;
        stats.blank += fi.blank;
        stats.functions += fi.function_count();
    }

    Ok(ScanResult {
        files: file_infos,
        breakdown,
    })
}
fn process_file(path: &Path, config: &ScanConfig) -> Result<Option<FileInfo>> {
if !path.is_file() {
return Ok(None);
}
if crate::language::is_lockfile(path) {
let last_modified = if config.use_git_dates {
if let Some(ref cache) = config.git_dates_cache {
cache.get(path).copied()
} else {
get_fs_last_modified(path)
}
} else {
None
};
return Ok(Some(
FileInfo::new(path.to_path_buf(), 0, 0, 0, 0, false, last_modified)
.mark_as_lockfile(),
));
}
if let Some(allowed) = &config.allowed_extensions {
let ext = path
.extension()
.and_then(|e| e.to_str())
.map(|e| format!(".{}", e.to_lowercase()))
.unwrap_or_default();
if !allowed.contains(&ext) {
return Ok(None);
}
}
let is_binary = is_binary_file(path);
if is_binary && config.allowed_extensions.is_some() {
return Ok(None);
}
let content: Option<String> = if !is_binary {
match std::fs::read_to_string(path) {
Ok(s) => Some(s),
Err(e) => {
return Err(anyhow::anyhow!("read error: {}", e));
}
}
} else {
None
};
let (total, code, comment, blank) = match &content {
Some(s) => analyze_content(s, path),
None => (0, 0, 0, 0),
};
let last_modified: Option<DateTime<Utc>> = if config.use_git_dates {
if let Some(ref cache) = config.git_dates_cache {
cache.get(path).copied()
} else {
get_fs_last_modified(path)
}
} else {
None
};
let mut fi = FileInfo::new(
path.to_path_buf(),
total,
code,
comment,
blank,
is_binary,
last_modified,
);
if config.extract_functions
&& !is_binary
&& let Some(ref s) = content
{
if let Some(extractor) = extractors::get_extractor(path) {
fi = fi.with_functions(extractor.extract(s));
}
}
Ok(Some(fi))
}
fn analyze_content(content: &str, path: &Path) -> (usize, usize, usize, usize) {
let ext = path
.extension()
.and_then(|e| e.to_str())
.map(|e| format!(".{}", e.to_lowercase()))
.unwrap_or_default();
let spec = crate::language::COMMENT_REGISTRY.get(ext.as_str());
let mut total = 0usize;
let mut code = 0usize;
let mut comment = 0usize;
let mut blank = 0usize;
let mut in_multi_comment = false;
for line in content.lines() {
total += 1;
let trimmed = line.trim();
if trimmed.is_empty() {
if in_multi_comment {
comment += 1;
} else {
blank += 1;
}
continue;
}
if let Some(s) = spec {
if in_multi_comment {
comment += 1;
if let Some((_, end)) = s.multi
&& trimmed.contains(end)
{
in_multi_comment = false;
}
continue;
}
if let Some((start, end)) = s.multi
&& trimmed.starts_with(start)
{
comment += 1;
let ends_on_same_line = if start == end {
trimmed[start.len()..].contains(end)
} else {
trimmed.contains(end)
};
if !ends_on_same_line {
in_multi_comment = true;
}
continue;
}
if let Some(single) = s.single
&& trimmed.starts_with(single)
{
comment += 1;
continue;
}
}
code += 1;
}
(total, code, comment, blank)
}
/// Test helper: reads `path` and classifies its lines with `analyze_content`.
/// An unreadable file yields all-zero counts.
#[cfg(test)]
fn analyze_file(path: &Path) -> (usize, usize, usize, usize) {
    std::fs::read_to_string(path)
        .map(|text| analyze_content(&text, path))
        .unwrap_or((0, 0, 0, 0))
}
/// Decides whether `path` should be treated as a binary file.
///
/// A file is binary when its extension is listed in `BINARY_EXTENSIONS`, when it
/// cannot be opened, or when the first chunk read from it (at most 8192 bytes)
/// contains a NUL byte. A chunk that starts with a UTF-16 or UTF-32 byte-order
/// mark is treated as text even though such encodings contain NUL bytes.
fn is_binary_file(path: &Path) -> bool {
    use std::io::Read;

    /// Byte-order marks that identify NUL-containing text encodings.
    const TEXT_BOMS: [&[u8]; 4] = [
        &[0xFE, 0xFF],             // UTF-16 BE
        &[0xFF, 0xFE],             // UTF-16 LE
        &[0x00, 0x00, 0xFE, 0xFF], // UTF-32 BE
        &[0xFF, 0xFE, 0x00, 0x00], // UTF-32 LE
    ];

    let dotted_ext = match path.extension().and_then(|e| e.to_str()) {
        Some(e) => format!(".{}", e.to_lowercase()),
        None => String::new(),
    };
    if BINARY_EXTENSIONS.contains(dotted_ext.as_str()) {
        return true;
    }

    let mut file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return true,
    };

    let mut buf = [0u8; 8192];
    // A failed read is treated like an empty file.
    let filled = file.read(&mut buf).unwrap_or(0);
    let head = &buf[..filled];

    if TEXT_BOMS.iter().any(|bom| head.starts_with(bom)) {
        return false;
    }
    head.contains(&0u8)
}
/// Reports whether `git rev-parse --is-inside-work-tree`, run in `dir`, exits
/// successfully. Any failure to launch git counts as "not a repository".
fn check_git_repo(dir: &Path) -> bool {
    let probe = Command::new("git")
        .args(["rev-parse", "--is-inside-work-tree"])
        .current_dir(dir)
        .output();
    matches!(probe, Ok(ref out) if out.status.success())
}
/// Lists the files to scan in a git work tree.
///
/// Starts from the tracked plus untracked-but-not-ignored files reported by
/// `git ls-files`. When git reports nothing, falls back to a manual walk
/// (without hidden files). If the locignore rules contain negations,
/// git-ignored files that the locignore rules do not exclude are added back.
/// Finally everything excluded by locignore is dropped.
///
/// The result is sorted and free of duplicates: `git ls-files --cached` prints
/// an unmerged path once per index stage, which would otherwise make a
/// conflicted file be processed and counted several times.
fn get_git_files(dir: &Path, locignore: &LocIgnore) -> Vec<PathBuf> {
    let mut files = git_ls_files(dir, &["--cached", "--others", "--exclude-standard"]);
    if files.is_empty() {
        return get_manual_files(dir, locignore, false);
    }

    if locignore.has_negations() {
        let git_ignored = git_ls_files(dir, &["--others", "--ignored", "--exclude-standard"]);
        files.extend(
            git_ignored
                .into_iter()
                .filter(|path| !locignore.is_excluded(path)),
        );
    }

    files.retain(|p| !locignore.is_excluded(p));
    files.sort_unstable();
    files.dedup();
    files
}
/// Runs `git ls-files -z <args>` in `dir` and returns the listed entries joined
/// onto `dir`. Returns an empty list when git cannot be run or exits with an
/// error. Output is decoded lossily as UTF-8.
fn git_ls_files(dir: &Path, args: &[&str]) -> Vec<PathBuf> {
    let mut command = Command::new("git");
    command
        .arg("ls-files")
        .arg("-z")
        .args(args)
        .current_dir(dir);

    let out = match command.output() {
        Ok(out) if out.status.success() => out,
        _ => return Vec::new(),
    };

    // `-z` terminates every entry with NUL, so the final split piece is empty.
    let listing = String::from_utf8_lossy(&out.stdout);
    let mut paths = Vec::new();
    for entry in listing.split('\0') {
        if !entry.is_empty() {
            paths.push(dir.join(entry));
        }
    }
    paths
}
/// Collects regular files below `dir` by walking the directory tree, without
/// following symbolic links.
///
/// Directories named in `EXCLUDED_DIRS` and `.git` are never entered. Unless
/// `include_hidden` is set, dot-files are skipped and dot-directories are not
/// entered, except `.well-known`. Directories excluded by `locignore` are
/// pruned only when the rules contain no negations (a negation might re-include
/// something inside); individual files are always checked against `locignore`.
/// Entries that cannot be read are skipped silently.
fn get_manual_files(
    dir: &Path,
    locignore: &LocIgnore,
    include_hidden: bool,
) -> Vec<PathBuf> {
    use walkdir::WalkDir;

    let walker = WalkDir::new(dir)
        .follow_links(false)
        .into_iter()
        .filter_entry(move |entry| {
            // The root itself is always visited.
            if entry.depth() == 0 {
                return true;
            }
            let name = entry.file_name().to_string_lossy();
            let is_hidden = name.starts_with('.');

            if !entry.file_type().is_dir() {
                return include_hidden || !is_hidden;
            }
            if EXCLUDED_DIRS.contains(name.as_ref()) || name == ".git" {
                return false;
            }
            if is_hidden && !include_hidden && name != ".well-known" {
                return false;
            }
            locignore.has_negations() || !locignore.is_excluded(entry.path())
        });

    let mut found = Vec::new();
    for entry in walker {
        if let Ok(entry) = entry {
            if entry.file_type().is_file() && !locignore.is_excluded(entry.path()) {
                found.push(entry.path().to_path_buf());
            }
        }
    }
    found
}
/// Builds a map from file path to the committer timestamp of the most recent
/// commit that touched it, using a single `git log` run in `root`.
///
/// Keys are `root.join(<path printed by git>)`. Two options keep those keys in
/// line with the paths produced by `git ls-files`:
///
/// * `--relative` makes git print paths relative to `root` (and leave out
///   changes outside it). Without it paths are relative to the repository top
///   level, so scanning a sub-directory of a repository would never hit the map.
/// * `-c core.quotepath=off` stops git from octal-escaping and quoting
///   non-ASCII file names. Names containing quotes, backslashes or control
///   characters are still quoted by git and will not match.
///
/// Returns an empty map when git cannot be run or exits with an error.
fn get_all_git_dates(root: &Path) -> HashMap<PathBuf, DateTime<Utc>> {
    let mut map = HashMap::new();

    let output = Command::new("git")
        .args([
            "-c",
            "core.quotepath=off",
            "log",
            "--format=commit %ct",
            "--name-only",
            "--relative",
        ])
        .current_dir(root)
        .output();
    let out = match output {
        Ok(out) if out.status.success() => out,
        _ => return map,
    };

    let stdout = String::from_utf8_lossy(&out.stdout);
    let mut current_ts = None;
    for line in stdout.lines() {
        let line = line.trim();
        if line.is_empty() {
            continue;
        }
        if let Some(rest) = line.strip_prefix("commit ") {
            // Header line emitted by `--format`: remember this commit's time.
            if let Ok(ts) = rest.parse::<i64>() {
                current_ts = Utc.timestamp_opt(ts, 0).single();
            }
        } else if let Some(ts) = current_ts {
            // `git log` lists newest commits first, so the first timestamp seen
            // for a path is its latest change; later (older) ones are ignored.
            map.entry(root.join(line)).or_insert(ts);
        }
    }
    map
}
/// Returns the filesystem modification time of `path` as a UTC timestamp with
/// whole-second precision, or `None` when the metadata is unavailable or the
/// time lies before the Unix epoch.
fn get_fs_last_modified(path: &Path) -> Option<DateTime<Utc>> {
    let modified = path.metadata().ok()?.modified().ok()?;
    let since_epoch = modified.duration_since(UNIX_EPOCH).ok()?;
    Utc.timestamp_opt(since_epoch.as_secs() as i64, 0).single()
}
// Unit tests for line counting, binary detection, the manual directory walk
// and comment classification. Every test works on files in a fresh temp dir.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::tempdir;
// Total line count of a file: the first element of the `analyze_file` tuple.
fn count_lines(path: &Path) -> usize {
analyze_file(path).0
}
// A trailing newline does not add an extra line.
#[test]
fn test_count_lines_basic() {
let dir = tempdir().unwrap();
let p = dir.path().join("test.txt");
fs::write(&p, "line1\nline2\nline3\n").unwrap();
assert_eq!(count_lines(&p), 3);
}
// The last line is counted even without a terminating newline.
#[test]
fn test_count_lines_no_trailing_newline() {
let dir = tempdir().unwrap();
let p = dir.path().join("test.txt");
fs::write(&p, "line1\nline2").unwrap();
assert_eq!(count_lines(&p), 2);
}
// An empty file has zero lines.
#[test]
fn test_count_lines_empty() {
let dir = tempdir().unwrap();
let p = dir.path().join("empty.txt");
fs::write(&p, "").unwrap();
assert_eq!(count_lines(&p), 0);
}
// A single unterminated line counts as one line.
#[test]
fn test_count_lines_single_line_no_newline() {
let dir = tempdir().unwrap();
let p = dir.path().join("single.txt");
fs::write(&p, "only one line").unwrap();
assert_eq!(count_lines(&p), 1);
}
// Binary detection by content (NUL byte) and by extension (`.png`, which the
// test expects to be listed in BINARY_EXTENSIONS).
#[test]
fn test_is_binary_file_detection() {
let dir = tempdir().unwrap();
let txt = dir.path().join("plain.txt");
fs::write(&txt, "just some text").unwrap();
assert!(!is_binary_file(&txt));
let bin = dir.path().join("blob.bin");
fs::write(&bin, vec![0u8, 1u8, 2u8]).unwrap();
assert!(is_binary_file(&bin));
let ext_bin = dir.path().join("image.png");
fs::write(&ext_bin, "pretend PNG").unwrap();
assert!(is_binary_file(&ext_bin));
}
// Files starting with a UTF-16/UTF-32 byte-order mark contain NUL bytes but
// must still be classified as text.
#[test]
fn test_is_binary_bom_detection() {
let dir = tempdir().unwrap();
let u16be = dir.path().join("utf16be.txt");
fs::write(&u16be, vec![0xFE, 0xFF, 0x00, 0x61]).unwrap();
assert!(!is_binary_file(&u16be), "UTF-16BE should not be binary");
let u16le = dir.path().join("utf16le.txt");
fs::write(&u16le, vec![0xFF, 0xFE, 0x61, 0x00]).unwrap();
assert!(!is_binary_file(&u16le), "UTF-16LE should not be binary");
let u32le = dir.path().join("utf32le.txt");
fs::write(
&u32le,
vec![0xFF, 0xFE, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00],
)
.unwrap();
assert!(!is_binary_file(&u32le), "UTF-32LE should not be binary");
}
// The manual walk skips `node_modules` (presumably via EXCLUDED_DIRS, which is
// defined outside this file) and honors entries of a `.locignore` file.
#[test]
fn test_manual_files_with_ignore() {
let dir = tempdir().unwrap();
fs::create_dir(dir.path().join("node_modules")).unwrap();
fs::write(dir.path().join("node_modules/index.js"), "js").unwrap();
fs::write(dir.path().join("keep.rs"), "rust").unwrap();
fs::write(dir.path().join("ignore_me.txt"), "text").unwrap();
fs::write(dir.path().join(".locignore"), "ignore_me.txt\n").unwrap();
let locignore = crate::locignore::LocIgnore::build(dir.path());
let files = get_manual_files(dir.path(), &locignore, false);
let names: HashSet<_> = files
.iter()
.map(|f| f.file_name().unwrap().to_str().unwrap())
.collect();
assert!(names.contains("keep.rs"));
assert!(!names.contains("ignore_me.txt"));
assert!(!names.contains("index.js"));
}
// A multi-line `"""` docstring: both delimiter lines and the lines between
// them count as comment.
#[test]
fn test_python_multiline_comment_counts() {
let dir = tempdir().unwrap();
let p = dir.path().join("test.py");
fs::write(
&p,
r#"def foo():
"""
This is a docstring.
It spans multiple lines.
"""
return 42
"#,
)
.unwrap();
let (total, code, comment, blank) = analyze_file(&p);
assert_eq!(total, 6);
assert_eq!(code, 2);
assert_eq!(comment, 4);
assert_eq!(blank, 0);
}
// A docstring opened and closed on the same line must not leave the analyzer
// in "inside comment" state.
#[test]
fn test_python_triple_quote_single_liner() {
let dir = tempdir().unwrap();
let p = dir.path().join("test.py");
fs::write(
&p,
r#"def foo():
"""One liner docstring."""
x = 1
y = 2
"#,
)
.unwrap();
let (_total, code, _comment, _blank) = analyze_file(&p);
assert_eq!(code, 3, "x = 1 and y = 2 must not be swallowed as comments");
}
// Only lines that *start* with a comment marker are comments; a trailing
// `// ...` after code leaves the line classified as code.
#[test]
fn test_rust_comment_classification() {
let dir = tempdir().unwrap();
let p = dir.path().join("test.rs");
fs::write(
&p,
r#"// single line comment
fn main() {
/* block comment */
let x = 1; // inline not a comment line
}
"#,
)
.unwrap();
let (total, code, comment, _blank) = analyze_file(&p);
assert_eq!(total, 5);
assert_eq!(comment, 2); assert_eq!(code, 3); }
}