impl SATDDetector {
/// Enumerates candidate source files under `root`.
///
/// Fast path: asks `git ls-files` for tracked plus untracked (but not
/// git-ignored) paths relative to `root`. If git is missing, fails, or
/// yields no file that passes `is_valid_source_file`, falls back to a
/// manual recursive directory walk.
///
/// # Errors
/// Propagates `TemplateError` from the recursive fallback walk only;
/// git failures are silently absorbed by the fallback.
pub(crate) async fn find_source_files(
    &self,
    root: &Path,
) -> Result<Vec<PathBuf>, TemplateError> {
    let git_result = tokio::process::Command::new("git")
        .args(["ls-files", "--cached", "--others", "--exclude-standard"])
        .current_dir(root)
        .output()
        .await;
    if let Ok(output) = git_result {
        if output.status.success() {
            let listing = String::from_utf8_lossy(&output.stdout);
            let candidates: Vec<PathBuf> = listing
                .lines()
                .filter(|entry| !entry.is_empty())
                .map(|entry| root.join(entry))
                .filter(|candidate| self.is_valid_source_file(candidate))
                .collect();
            if !candidates.is_empty() {
                return Ok(candidates);
            }
        }
    }
    // Fallback: walk the tree ourselves (non-git checkouts, git errors,
    // or a git listing that produced no usable files).
    let mut collected = Vec::new();
    self.collect_files_recursive(root, &mut collected).await?;
    Ok(collected)
}
/// Recursively walks `dir`, appending every valid source file to `files`.
///
/// Returns a boxed, pinned future because `async fn` cannot recurse
/// directly: the recursion (via `process_directory_entry` →
/// `process_subdirectory` → this function) would give the future an
/// infinitely sized type. The single `'a` lifetime ties the returned
/// future to the borrows of `self`, `dir`, and `files`.
///
/// # Errors
/// Returns `TemplateError::Io` if reading a directory or one of its
/// entries fails.
fn collect_files_recursive<'a>(
&'a self,
dir: &'a Path,
files: &'a mut Vec<PathBuf>,
) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<(), TemplateError>> + Send + 'a>>
{
Box::pin(async move {
// Non-directories (or paths that vanished since listing) are skipped.
if !dir.is_dir() {
return Ok(());
}
let mut entries = tokio::fs::read_dir(dir).await.map_err(TemplateError::Io)?;
while let Some(entry) = entries.next_entry().await.map_err(TemplateError::Io)? {
let path = entry.path();
self.process_directory_entry(&path, files).await?;
}
Ok(())
})
}
/// Dispatches one directory entry: recurse into subdirectories,
/// otherwise consider the path as a candidate source file.
async fn process_directory_entry(
    &self,
    path: &Path,
    files: &mut Vec<PathBuf>,
) -> Result<(), TemplateError> {
    if !path.is_dir() {
        // Plain file: filtering happens inside process_file.
        self.process_file(path, files);
        return Ok(());
    }
    self.process_subdirectory(path, files).await
}
/// Recurses into `path` unless it matches the directory skip list
/// (hidden directories and common build/output directories).
async fn process_subdirectory(
    &self,
    path: &Path,
    files: &mut Vec<PathBuf>,
) -> Result<(), TemplateError> {
    if self.should_skip_directory(path) {
        Ok(())
    } else {
        self.collect_files_recursive(path, files).await
    }
}
/// True when the directory's final path component is on the exclusion
/// list. Components that are not valid UTF-8 are never skipped.
fn should_skip_directory(&self, path: &Path) -> bool {
    path.file_name()
        .and_then(|n| n.to_str())
        .map_or(false, |name| self.is_excluded_directory_name(name))
}
/// A directory is excluded when it is hidden (dot-prefixed) or a
/// well-known build/output directory.
fn is_excluded_directory_name(&self, name: &str) -> bool {
    if name.starts_with('.') {
        return true;
    }
    self.is_common_build_directory(name)
}
/// Matches directory names produced by common build tools and package
/// managers; their contents are generated, not hand-authored.
fn is_common_build_directory(&self, name: &str) -> bool {
    matches!(
        name,
        "target" | "node_modules" | "dist" | "build" | "__pycache__" | "book"
    )
}
/// Appends `path` to `files` when it passes the source-file filter.
fn process_file(&self, path: &Path, files: &mut Vec<PathBuf>) {
    if !self.is_valid_source_file(path) {
        return;
    }
    files.push(path.to_path_buf());
}
/// A candidate is valid when it carries a recognized source extension
/// and does not look like a test file. Both checks are pure, so the
/// evaluation order does not matter.
fn is_valid_source_file(&self, path: &Path) -> bool {
    if self.is_test_file(path) {
        return false;
    }
    self.is_source_file(path)
}
/// Recognizes source files by extension across the supported
/// languages. Paths without an extension, or with a non-UTF-8
/// extension, are rejected.
pub(crate) fn is_source_file(&self, path: &Path) -> bool {
    const SOURCE_EXTENSIONS: &[&str] = &[
        "rs", "py", "js", "ts", "jsx", "tsx", "java", "cpp", "c", "h",
        "hpp", "cs", "go", "php", "rb", "swift", "kt", "scala", "clj",
        "hs", "ml", "elm",
    ];
    path.extension()
        .and_then(|e| e.to_str())
        .map_or(false, |ext| SOURCE_EXTENSIONS.contains(&ext))
}
/// Heuristically decides whether `path` is a test file.
///
/// A file counts as a test when it lives under a `tests/` or `test/`
/// directory (either path-separator style), or when its file name
/// contains the substring "test" or "spec". Non-UTF-8 file names
/// outside a test directory return `false`.
///
/// NOTE(review): the substring checks are deliberately broad and also
/// match names like `attestation.rs` or `inspector.js`. The previous
/// explicit suffix checks (`_test.rs`, `.test.js`, `.spec.ts`, …) were
/// all subsumed by the two substring checks — every such suffix
/// contains "test" or "spec" — so they were dead code and have been
/// removed with no behavior change.
pub(crate) fn is_test_file(&self, path: &Path) -> bool {
    let path_str = path.to_string_lossy();
    let in_test_dir = ["/tests/", "/test/", "\\tests\\", "\\test\\"]
        .iter()
        .any(|dir| path_str.contains(dir));
    if in_test_dir {
        return true;
    }
    path.file_name()
        .and_then(|n| n.to_str())
        .map_or(false, |name| name.contains("test") || name.contains("spec"))
}
pub(crate) fn should_exclude_file(&self, file_path: &Path) -> bool {
let path_str = file_path.to_string_lossy();
self.is_satd_analysis_tool(&path_str)
|| self.is_build_or_config_file(&path_str)
|| self.is_example_or_demo(&path_str)
|| self.is_fuzz_target(&path_str)
|| self.is_generated_or_vendor(&path_str)
}
/// Excludes the SATD tooling itself, so the detector does not flag
/// its own pattern lists and fixtures as technical debt.
fn is_satd_analysis_tool(&self, path_str: &str) -> bool {
    let by_name = ["satd_detector", "satd_property_tests", "quality_proxy"]
        .iter()
        .any(|needle| path_str.contains(needle));
    by_name || (path_str.contains("test") && path_str.contains("satd"))
}
/// Build scripts, manifests, and repository metadata are not
/// analysis targets.
fn is_build_or_config_file(&self, path_str: &str) -> bool {
    ["/build.rs", "/Cargo.toml", ".gitignore", "README"]
        .iter()
        .any(|needle| path_str.contains(needle))
}
/// Example and demo code is intentionally simplified and excluded.
fn is_example_or_demo(&self, path_str: &str) -> bool {
    ["/examples/", "/demo/", "_demo"]
        .iter()
        .any(|needle| path_str.contains(needle))
}
/// Fuzz harnesses are excluded from analysis.
fn is_fuzz_target(&self, path_str: &str) -> bool {
    ["/fuzz/", "fuzz_targets"]
        .iter()
        .any(|needle| path_str.contains(needle))
}
/// Generated output and vendored third-party code are excluded.
fn is_generated_or_vendor(&self, path_str: &str) -> bool {
    ["/target/", "/vendor/", "/node_modules/", "/book/", ".generated"]
        .iter()
        .any(|needle| path_str.contains(needle))
}
/// Flags vendored or minified assets by path component and file name.
///
/// A file is flagged when any path component is exactly `vendor`, or
/// when its (UTF-8) file name contains one of the minified/bundled
/// markers `.min.`, `.bundle.`, `-min.`, or `.production.`.
///
/// NOTE(review): the previous `ends_with(".min.js")`, `.min.css`,
/// `.bundle.js`, and `.production.js` arms were dead code — each
/// suffix already contains the corresponding `contains` needle — so
/// they have been removed with no behavior change.
pub(crate) fn is_minified_or_vendor_file(&self, path: &Path) -> bool {
    // Anything under a `vendor` directory is third-party code.
    if path.components().any(|c| c.as_os_str() == "vendor") {
        return true;
    }
    path.file_name()
        .and_then(|n| n.to_str())
        .map_or(false, |file_name| {
            [".min.", ".bundle.", "-min.", ".production."]
                .iter()
                .any(|marker| file_name.contains(marker))
        })
}
/// Samples the first three lines of `path`; a very long line
/// (> 5000 bytes) is a strong signal of minified or generated
/// content. Any I/O failure (open or read) is treated as
/// "not minified".
pub(crate) async fn is_likely_minified_content(&self, path: &Path) -> bool {
    use tokio::io::{AsyncBufReadExt, BufReader};
    let file = match tokio::fs::File::open(path).await {
        Ok(f) => f,
        Err(_) => return false,
    };
    let mut lines = BufReader::new(file).lines();
    let mut inspected = 0;
    while inspected < 3 {
        match lines.next_line().await {
            // A single huge line is enough to classify the file.
            Ok(Some(line)) if line.len() > 5000 => return true,
            Ok(Some(_)) => inspected += 1,
            // EOF or read error: nothing suspicious found.
            Ok(None) | Err(_) => return false,
        }
    }
    false
}
}