use std::path::{Path, PathBuf};
/// Directory names the walker never descends into, at any depth below the root.
///
/// Compared against the directory's own name (not its full path) with an exact,
/// case-sensitive match.
pub const ALWAYS_SKIP_DIRS: &[&str] = &[
"node_modules",
"vendor",
".git",
".svn",
".hg",
"__pycache__",
];
/// Directory names that are skipped only when they sit directly inside the scan root.
///
/// A directory with the same name deeper in the tree (e.g. `scripts/build`) is still walked.
pub const ROOT_ONLY_SKIP_DIRS: &[&str] = &["build", "dist", "target", "cache", "tmp"];
/// File extensions (without the leading dot) accepted by [`ExtensionFilter::SourceDefaults`].
///
/// Matching is exact and case-sensitive.
pub const SOURCE_EXTENSIONS: &[&str] = &[
"rs", "php", "js", "jsx", "ts", "tsx", "mjs", "json", "toml", "yaml", "yml", "md", "txt", "sh",
"bash", "py", "rb", "go", "swift", "kt", "java", "c", "cpp", "h", "lock",
];
/// Selects which files the walker keeps, based on the file extension.
///
/// Extensions are compared without the leading dot, exactly and case-sensitively.
/// A file with no extension (or a non-UTF-8 one) is compared as the empty string.
#[derive(Debug, Clone, Default)]
pub enum ExtensionFilter {
/// Keep every file regardless of extension.
All,
/// Keep only files whose extension equals one of the listed values.
Only(Vec<String>),
/// Keep every file except those whose extension equals one of the listed values.
Except(Vec<String>),
/// Keep files whose extension appears in [`SOURCE_EXTENSIONS`]. This is the default.
#[default]
SourceDefaults,
}
/// Options controlling which directories are walked and which files are returned.
///
/// The derived `Default` yields empty skip lists, [`ExtensionFilter::SourceDefaults`],
/// and `skip_hidden == false`.
#[derive(Debug, Clone, Default)]
pub struct ScanConfig {
/// Extra directory names to skip at any depth, in addition to [`ALWAYS_SKIP_DIRS`].
pub extra_skip_dirs: Vec<String>,
/// Extra directory names to skip only directly under the root, in addition to
/// [`ROOT_ONLY_SKIP_DIRS`].
pub extra_root_skip_dirs: Vec<String>,
/// Extension filter applied to every file that is not otherwise skipped.
pub extensions: ExtensionFilter,
/// When `true`, files and directories whose name starts with `.` are skipped.
pub skip_hidden: bool,
}
/// Collects every file under `root` that passes the skip rules and the extension
/// filter described by `config`.
///
/// Directories that cannot be read are silently ignored. Paths are returned in the
/// order the directory entries are yielded, each one prefixed with `root`.
pub fn walk_files(root: &Path, config: &ScanConfig) -> Vec<PathBuf> {
    let mut collected = Vec::new();
    // The root is passed twice: once as the directory to read, once as the
    // reference point for the root-only skip rules.
    walk_recursive(root, root, config, &mut collected);
    collected
}
/// Depth-first walk of `dir`, appending accepted files to `files`.
///
/// * `dir` – directory currently being read.
/// * `root` – directory the whole walk started from; root-only skip rules apply
///   only while `dir == root`.
/// * `config` – skip lists, hidden-entry policy and extension filter.
/// * `files` – output accumulator.
///
/// Unreadable directories and unreadable individual entries are skipped silently.
/// Directory detection uses `Path::is_dir`, which follows symbolic links.
fn walk_recursive(dir: &Path, root: &Path, config: &ScanConfig, files: &mut Vec<PathBuf>) {
    let entries = match std::fs::read_dir(dir) {
        Ok(entries) => entries,
        Err(_) => return,
    };
    let at_root = dir == root;

    for path in entries.flatten().map(|entry| entry.path()) {
        let name = match path.file_name() {
            Some(os_name) => os_name.to_string_lossy().into_owned(),
            None => String::new(),
        };
        // Hidden entries are rejected the same way for files and directories.
        let hidden = config.skip_hidden && name.starts_with('.');

        if !path.is_dir() {
            if !hidden && matches_extension(&path, &config.extensions) {
                files.push(path);
            }
            continue;
        }

        let skipped_everywhere = ALWAYS_SKIP_DIRS.contains(&name.as_str())
            || config.extra_skip_dirs.iter().any(|d| *d == name);
        let skipped_at_root = at_root
            && (ROOT_ONLY_SKIP_DIRS.contains(&name.as_str())
                || config.extra_root_skip_dirs.iter().any(|d| *d == name));

        if hidden || skipped_everywhere || skipped_at_root {
            continue;
        }
        walk_recursive(&path, root, config, files);
    }
}
/// Returns `true` when `path` is accepted by `filter`.
///
/// The extension is taken without its leading dot; a missing or non-UTF-8
/// extension is treated as the empty string. Comparison is exact and case-sensitive.
fn matches_extension(path: &Path, filter: &ExtensionFilter) -> bool {
    let ext = path
        .extension()
        .and_then(std::ffi::OsStr::to_str)
        .unwrap_or_default();

    match filter {
        ExtensionFilter::All => true,
        ExtensionFilter::SourceDefaults => SOURCE_EXTENSIONS.contains(&ext),
        ExtensionFilter::Only(allowed) => allowed.iter().any(|e| e.as_str() == ext),
        // "Not in the deny list" expressed as "differs from every denied entry".
        ExtensionFilter::Except(denied) => denied.iter().all(|e| e.as_str() != ext),
    }
}
/// Returns `true` when byte `c` cannot be part of an identifier, i.e. it is neither
/// an ASCII letter or digit nor an underscore. Every non-ASCII byte counts as a boundary.
fn is_boundary_char(c: u8) -> bool {
    let identifier_byte = c.is_ascii_alphanumeric() || c == b'_';
    !identifier_byte
}
/// One occurrence of a search term, as produced by [`search`].
#[derive(Debug, Clone)]
pub struct Match {
/// Path of the file containing the hit, relative to the search root when the
/// prefix can be stripped, otherwise the path as walked.
pub file: String,
/// 1-based line number of the hit.
pub line: usize,
/// 1-based column of the hit, counted in bytes from the start of the line.
pub column: usize,
/// Text that matched: the search term itself in boundary and literal mode, the
/// text as it appears in the file in case-insensitive mode.
pub matched: String,
/// Full text of the line containing the hit.
pub context: String,
}
/// Finds occurrences of `term` in `text` that sit on identifier-style word boundaries.
///
/// Returns the byte offset of every accepted occurrence, in ascending order.
/// Overlapping occurrences are considered. An empty `term` yields no matches.
///
/// An occurrence is accepted when both sides are boundaries.
///
/// Left side, any of:
/// * the occurrence starts at offset 0;
/// * the preceding byte is not an ASCII letter or digit (underscores, punctuation,
///   whitespace and non-ASCII bytes all qualify);
/// * the occurrence starts with an ASCII uppercase letter and follows a lowercase
///   letter or digit (camelCase hump, e.g. `Widget` in `loadWidget`);
/// * the occurrence starts with an ASCII uppercase letter, follows another uppercase
///   letter, is at offset 2 or later, and the term's second byte is lowercase
///   (acronym followed by a word, e.g. `Agent` in `WPAgent`).
///
/// Right side, any of:
/// * the occurrence ends at the end of `text`;
/// * the following byte is not an ASCII letter or digit, or is an ASCII uppercase letter.
pub fn find_boundary_matches(text: &str, term: &str) -> Vec<usize> {
    let text_bytes = text.as_bytes();
    let term_bytes = term.as_bytes();
    let term_len = term_bytes.len();
    let text_len = text_bytes.len();
    let mut matches = Vec::new();

    let Some(first_char) = term.chars().next() else {
        return matches;
    };
    if term_len > text_len {
        return matches;
    }
    // Resuming the scan one *character* (not one byte) past the previous hit keeps
    // `start` on a UTF-8 boundary, so slicing `text` can never panic, while still
    // allowing overlapping occurrences to be found.
    let step = first_char.len_utf8();

    let mut start = 0;
    while let Some(pos) = text[start..].find(term) {
        let abs = start + pos;
        let end = abs + term_len;

        let left_ok = abs == 0 || {
            let prev = text_bytes[abs - 1];
            let first = text_bytes[abs];
            is_boundary_char(prev)
                || prev == b'_'
                || (first.is_ascii_uppercase()
                    && (prev.is_ascii_lowercase() || prev.is_ascii_digit()))
                || (abs >= 2
                    && first.is_ascii_uppercase()
                    && prev.is_ascii_uppercase()
                    && term_len > 1
                    && term_bytes[1].is_ascii_lowercase())
        };

        let right_ok = end >= text_len || {
            let next = text_bytes[end];
            is_boundary_char(next) || next.is_ascii_uppercase() || next == b'_'
        };

        if left_ok && right_ok {
            matches.push(abs);
        }
        start = abs + step;
    }
    matches
}
/// Finds every occurrence of `term` in `text` as a plain, case-sensitive substring.
///
/// Returns the byte offset of each occurrence in ascending order. Overlapping
/// occurrences are all reported (`"aa"` in `"aaa"` yields `[0, 1]`). An empty `term`
/// yields no matches.
pub fn find_literal_matches(text: &str, term: &str) -> Vec<usize> {
    let mut matches = Vec::new();
    let Some(first_char) = term.chars().next() else {
        return matches;
    };
    // Advance by the byte length of the term's first character rather than by one
    // byte, so the next slice always starts on a UTF-8 character boundary.
    let step = first_char.len_utf8();

    let mut start = 0;
    while let Some(pos) = text[start..].find(term) {
        let abs = start + pos;
        matches.push(abs);
        start = abs + step;
    }
    matches
}
/// Finds every occurrence of `term` in `text`, ignoring letter case.
///
/// Returns `(offset, actual)` pairs in ascending order, where `offset` is the byte
/// offset of the occurrence in the original `text` and `actual` is the text exactly
/// as it appears there (original casing preserved). Overlapping occurrences are
/// reported. An empty `term` yields no matches.
///
/// Case folding is done per character with `char::to_lowercase`. Because lowercasing
/// can change a character's byte length, the search runs on a lowered copy of `text`
/// and every hit is mapped back to the original characters that produced it.
pub fn find_case_insensitive_matches(text: &str, term: &str) -> Vec<(usize, String)> {
    let mut matches = Vec::new();

    // Fold the term character by character, the same way the text is folded below,
    // so both sides agree regardless of context-sensitive rules.
    let term_lower: String = term.chars().flat_map(char::to_lowercase).collect();
    let Some(first_char) = term_lower.chars().next() else {
        return matches;
    };
    let step = first_char.len_utf8();

    // `origin[i]` is the byte range in `text` of the character whose lowercase form
    // contributed byte `i` of `text_lower`.
    let mut text_lower = String::with_capacity(text.len());
    let mut origin: Vec<(usize, usize)> = Vec::with_capacity(text.len());
    for (idx, ch) in text.char_indices() {
        let end = idx + ch.len_utf8();
        for lowered in ch.to_lowercase() {
            text_lower.push(lowered);
            origin.extend(std::iter::repeat((idx, end)).take(lowered.len_utf8()));
        }
    }

    let mut start = 0;
    while let Some(pos) = text_lower[start..].find(&term_lower) {
        let low_start = start + pos;
        let low_end = low_start + term_lower.len();
        // Widen the hit to whole original characters on both sides.
        let orig_start = origin[low_start].0;
        let orig_end = origin[low_end - 1].1;
        matches.push((orig_start, text[orig_start..orig_end].to_string()));
        // Step one character forward in the lowered text to stay on a char boundary.
        start = low_start + step;
    }
    matches
}
/// Matching strategy used by [`search`].
#[derive(Debug, Clone)]
pub enum SearchMode {
/// Case-sensitive match restricted to identifier-style word boundaries
/// (see [`find_boundary_matches`]).
Boundary,
/// Case-sensitive plain substring match (see [`find_literal_matches`]).
Literal,
/// Substring match ignoring letter case (see [`find_case_insensitive_matches`]).
CaseInsensitive,
}
/// Searches every file selected by `config` under `root` for `term`, line by line.
///
/// `mode` chooses the matching strategy. Files that cannot be read into a `String`
/// are skipped silently. Each hit becomes a [`Match`] carrying the file path
/// (relative to `root` when possible), the 1-based line number, the 1-based byte
/// column, the matched text and the full line.
pub fn search(root: &Path, term: &str, mode: &SearchMode, config: &ScanConfig) -> Vec<Match> {
    let mut results = Vec::new();

    for file_path in walk_files(root, config) {
        let content = match std::fs::read_to_string(&file_path) {
            Ok(text) => text,
            Err(_) => continue,
        };
        let relative = file_path
            .strip_prefix(root)
            .unwrap_or(&file_path)
            .to_string_lossy()
            .into_owned();

        for (index, line) in content.lines().enumerate() {
            // Normalise every mode to (byte offset, matched text) pairs so the
            // `Match` record is built in exactly one place.
            let hits: Vec<(usize, String)> = match mode {
                SearchMode::Boundary => find_boundary_matches(line, term)
                    .into_iter()
                    .map(|pos| (pos, term.to_string()))
                    .collect(),
                SearchMode::Literal => find_literal_matches(line, term)
                    .into_iter()
                    .map(|pos| (pos, term.to_string()))
                    .collect(),
                SearchMode::CaseInsensitive => find_case_insensitive_matches(line, term),
            };

            results.extend(hits.into_iter().map(|(pos, matched)| Match {
                file: relative.clone(),
                line: index + 1,
                column: pos + 1,
                matched,
                context: line.to_string(),
            }));
        }
    }
    results
}
/// Reports which casings of `term` actually occur under `root`, and how often.
///
/// Runs a case-insensitive [`search`] and tallies each distinct spelling found.
/// Returns `(casing, occurrences)` pairs sorted by occurrence count, highest first.
/// Equal counts are ordered by casing in ascending order, so the result is
/// deterministic even though the tally is built in a `HashMap`.
pub fn discover_casing(root: &Path, term: &str, config: &ScanConfig) -> Vec<(String, usize)> {
    let mut casing_counts: std::collections::HashMap<String, usize> =
        std::collections::HashMap::new();
    for m in search(root, term, &SearchMode::CaseInsensitive, config) {
        *casing_counts.entry(m.matched).or_default() += 1;
    }

    let mut result: Vec<(String, usize)> = casing_counts.into_iter().collect();
    // Hash map iteration order is arbitrary, so ties need an explicit secondary key.
    result.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Scratch directory under the system temp dir.
    ///
    /// The directory is removed when the value is dropped, so fixtures are cleaned up
    /// even when an assertion in the test fails. Setup errors panic immediately
    /// instead of being discarded.
    struct Fixture {
        root: std::path::PathBuf,
    }

    impl Fixture {
        /// Creates an empty directory called `name`, replacing any leftover from an
        /// earlier run.
        fn new(name: &str) -> Self {
            let root = std::env::temp_dir().join(name);
            let _ = std::fs::remove_dir_all(&root);
            std::fs::create_dir_all(&root).expect("create fixture root");
            Self { root }
        }

        /// Writes `contents` to `relative` inside the fixture, creating parent
        /// directories as needed.
        fn write(&self, relative: &str, contents: &str) {
            let path = self.root.join(relative);
            if let Some(parent) = path.parent() {
                std::fs::create_dir_all(parent).expect("create fixture subdirectory");
            }
            std::fs::write(path, contents).expect("write fixture file");
        }

        /// File names (last path component only) returned by `walk_files` for this fixture.
        fn walked_names(&self, config: &ScanConfig) -> Vec<String> {
            walk_files(&self.root, config)
                .iter()
                .map(|f| f.file_name().unwrap().to_string_lossy().to_string())
                .collect()
        }
    }

    impl Drop for Fixture {
        fn drop(&mut self) {
            let _ = std::fs::remove_dir_all(&self.root);
        }
    }

    #[test]
    fn walk_files_skips_always_dirs() {
        let fixture = Fixture::new("homeboy_scan_always_skip_test");
        fixture.write("node_modules/lib.js", "x");
        fixture.write("vendor/lib.php", "x");
        fixture.write("src/main.rs", "x");

        let names = fixture.walked_names(&ScanConfig::default());

        assert!(names.contains(&"main.rs".to_string()));
        assert!(!names.contains(&"lib.js".to_string()));
        assert!(!names.contains(&"lib.php".to_string()));
    }

    #[test]
    fn walk_files_root_only_skip_at_root() {
        let fixture = Fixture::new("homeboy_scan_root_skip_test");
        fixture.write("build/output.rs", "x");
        fixture.write("scripts/build/setup.sh", "x");

        let names = fixture.walked_names(&ScanConfig::default());

        assert!(
            names.contains(&"setup.sh".to_string()),
            "Should find scripts/build/setup.sh"
        );
        assert!(
            !names.contains(&"output.rs".to_string()),
            "Should skip root-level build/"
        );
    }

    #[test]
    fn walk_files_extension_filter_only() {
        let fixture = Fixture::new("homeboy_scan_ext_only_test");
        fixture.write("main.rs", "x");
        fixture.write("style.css", "x");
        fixture.write("readme.md", "x");
        let config = ScanConfig {
            extensions: ExtensionFilter::Only(vec!["rs".to_string(), "md".to_string()]),
            ..Default::default()
        };

        let names = fixture.walked_names(&config);

        assert!(names.contains(&"main.rs".to_string()));
        assert!(names.contains(&"readme.md".to_string()));
        assert!(!names.contains(&"style.css".to_string()));
    }

    #[test]
    fn walk_files_skip_hidden() {
        let fixture = Fixture::new("homeboy_scan_hidden_test");
        fixture.write(".hidden/secret.rs", "x");
        fixture.write(".dotfile.rs", "x");
        fixture.write("visible.rs", "x");
        let config = ScanConfig {
            skip_hidden: true,
            ..Default::default()
        };

        let names = fixture.walked_names(&config);

        assert!(names.contains(&"visible.rs".to_string()));
        assert!(!names.contains(&"secret.rs".to_string()));
        assert!(!names.contains(&".dotfile.rs".to_string()));
    }

    #[test]
    fn walk_files_pycache_always_skipped() {
        let fixture = Fixture::new("homeboy_scan_pycache_test");
        fixture.write("src/__pycache__/module.py", "x");
        fixture.write("src/main.py", "x");

        let names = fixture.walked_names(&ScanConfig::default());

        assert!(names.contains(&"main.py".to_string()));
        assert!(!names.contains(&"module.py".to_string()));
    }

    #[test]
    fn boundary_matches_word_boundaries() {
        assert_eq!(find_boundary_matches("pub mod widget;", "widget"), vec![8]);
        assert_eq!(find_boundary_matches("load_widget", "widget"), vec![5]);
        assert_eq!(find_boundary_matches("WIDGET_DIR", "WIDGET"), vec![0]);
    }

    #[test]
    fn boundary_rejects_partial_match() {
        assert!(find_boundary_matches("widgetry", "widget").is_empty());
        assert!(find_boundary_matches("rewidget", "widget").is_empty());
    }

    #[test]
    fn boundary_matches_camel_case() {
        assert_eq!(find_boundary_matches("WidgetManifest", "Widget"), vec![0]);
        assert_eq!(find_boundary_matches("loadWidget", "Widget"), vec![4]);
    }

    #[test]
    fn boundary_matches_consecutive_uppercase() {
        assert_eq!(find_boundary_matches("WPAgent", "Agent"), vec![2]);
        assert_eq!(find_boundary_matches("WPAgent", "WP"), vec![0]);
        assert_eq!(find_boundary_matches("XMLParser", "XML"), vec![0]);
        assert_eq!(find_boundary_matches("XMLParser", "Parser"), vec![3]);
    }

    #[test]
    fn literal_matches_exact_substring() {
        assert_eq!(find_literal_matches("the-widget-plugin", "widget"), vec![4]);
        assert_eq!(find_literal_matches("widgetry", "widget"), vec![0]);
    }

    #[test]
    fn case_insensitive_finds_different_casing() {
        let matches = find_case_insensitive_matches("namespace WPAgent;", "wpagent");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].0, 10);
        assert_eq!(matches[0].1, "WPAgent");
    }

    #[test]
    fn case_insensitive_finds_multiple_casings() {
        let text = "WPAgent and wpagent and Wpagent";
        let matches = find_case_insensitive_matches(text, "wpagent");
        assert_eq!(matches.len(), 3);
        assert_eq!(matches[0].1, "WPAgent");
        assert_eq!(matches[1].1, "wpagent");
        assert_eq!(matches[2].1, "Wpagent");
    }

    #[test]
    fn discover_casing_finds_actual_convention() {
        let fixture = Fixture::new("homeboy_scan_discover_test");
        fixture.write(
            "test.php",
            "namespace WPAgent;\nuse WPAgent\\Tools;\nclass WPAgent_Loader {}\n",
        );

        let casings = discover_casing(&fixture.root, "wpagent", &ScanConfig::default());

        assert!(!casings.is_empty());
        assert_eq!(casings[0].0, "WPAgent");
        assert_eq!(casings[0].1, 3);
    }

    #[test]
    fn search_boundary_mode() {
        let fixture = Fixture::new("homeboy_scan_search_boundary_test");
        fixture.write("test.rs", "fn widget_init() {}\nlet widgetry = true;\n");

        let results = search(
            &fixture.root,
            "widget",
            &SearchMode::Boundary,
            &ScanConfig::default(),
        );

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].line, 1);
        assert_eq!(results[0].matched, "widget");
    }

    #[test]
    fn search_literal_mode() {
        let fixture = Fixture::new("homeboy_scan_search_literal_test");
        fixture.write("test.rs", "fn widget_init() {}\nlet widgetry = true;\n");

        let results = search(
            &fixture.root,
            "widget",
            &SearchMode::Literal,
            &ScanConfig::default(),
        );

        assert_eq!(results.len(), 2);
    }

    #[test]
    fn search_case_insensitive_mode() {
        let fixture = Fixture::new("homeboy_scan_search_ci_test");
        fixture.write("test.php", "namespace WPAgent;\nconst WP_AGENT = true;\n");

        let results = search(
            &fixture.root,
            "wpagent",
            &SearchMode::CaseInsensitive,
            &ScanConfig::default(),
        );

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].matched, "WPAgent");
    }
}