use std::fs; use std::path::{Path, PathBuf};
use rayon::prelude::*; use thiserror::Error; use tree_sitter::Tree; use walkdir::WalkDir;
use crate::{parse_once, LanguageId, ParserError};
/// Settings that control which files a directory scan picks up.
#[derive(Debug, Clone)]
pub struct FileScanConfig {
    /// Path the directory walk starts from.
    pub root: PathBuf,
    /// Whether the directory walk follows symbolic links.
    pub follow_symlinks: bool,
    /// Largest file size, in bytes, that is still scanned; `None` disables
    /// the size check.
    pub max_file_size: Option<u64>,
}

impl FileScanConfig {
    /// Creates a configuration rooted at `root` with the default settings:
    /// symbolic links are not followed and files larger than 2 MiB are
    /// skipped.
    pub fn new(root: impl AsRef<Path>) -> Self {
        // 2 MiB default cap on the size of a scanned file.
        const DEFAULT_MAX_FILE_SIZE: u64 = 2 * 1024 * 1024;

        let root = PathBuf::from(root.as_ref());
        Self {
            root,
            follow_symlinks: false,
            max_file_size: Some(DEFAULT_MAX_FILE_SIZE),
        }
    }
}
/// A source file that was read from disk and parsed into a syntax tree.
#[derive(Debug)]
pub struct ParsedFile {
    /// Path of the file as produced by the directory walk.
    pub path: PathBuf,
    /// Language the file was parsed as.
    pub language: LanguageId,
    /// Syntax tree produced by parsing `source`.
    pub tree: Tree,
    /// Full text of the file.
    pub source: String,
    /// Whether the path looks like a test file.
    pub is_test: bool,
}
/// Heuristically decides whether `path` looks like a test file.
///
/// A path is treated as a test file when either:
///
/// * one of its parent directories is named like a conventional test
///   directory (`test`, `tests`, `spec`, `specs`, `__tests__`, `__test__`,
///   `testing`, `testcases` or `t`), compared case-insensitively, or
/// * its file stem, lowercased, starts with `test_` / `test-`, ends with
///   `test`, `tests`, `_spec`, `.spec` or `_eunit`, or contains `_test_` /
///   `-test-`.
///
/// Directories are matched per path component rather than by searching the
/// rendered path for `"/tests/"`-style substrings, so relative paths such as
/// `tests/foo.rs` and platform-specific separators are recognised too.
///
/// Note that the bare `test` / `tests` suffixes are broad: they cover names
/// such as `FooTest`, `foo_test` and `foo.test`, but also ordinary words
/// like `latest`.
///
/// Returns `false` when no directory matches and the file stem is missing or
/// is not valid UTF-8.
fn is_test_file(path: &Path) -> bool {
    const TEST_DIR_NAMES: [&str; 9] = [
        "test",
        "tests",
        "spec",
        "specs",
        "__tests__",
        "__test__",
        "testing",
        "testcases",
        "t",
    ];
    const STEM_PREFIXES: [&str; 2] = ["test_", "test-"];
    // `test` already covers `_test`, `-test` and `.test`; `tests` covers `_tests`.
    const STEM_SUFFIXES: [&str; 5] = ["test", "tests", "_spec", ".spec", "_eunit"];
    const STEM_INFIXES: [&str; 2] = ["_test_", "-test-"];

    // Only the directories leading to the file are inspected here; the file
    // name itself is handled by the stem rules below.
    let in_test_dir = path.parent().map_or(false, |dir| {
        dir.components().any(|component| match component {
            std::path::Component::Normal(name) => {
                let name = name.to_string_lossy().to_lowercase();
                TEST_DIR_NAMES.contains(&name.as_str())
            }
            // Roots, prefixes, `.` and `..` never name a test directory.
            _ => false,
        })
    });
    if in_test_dir {
        return true;
    }

    path.file_stem()
        .and_then(|stem| stem.to_str())
        .map_or(false, |stem| {
            let stem = stem.to_lowercase();
            STEM_PREFIXES.iter().any(|&prefix| stem.starts_with(prefix))
                || STEM_SUFFIXES.iter().any(|&suffix| stem.ends_with(suffix))
                || STEM_INFIXES.iter().any(|&infix| stem.contains(infix))
        })
}
#[derive(Debug, Error)] pub enum ScannerError {
#[error("walkdir error: {0}")]
Walk(#[from] walkdir::Error),
#[error("io error reading file {path:?}: {source}")]
ReadFile {
path: PathBuf, #[source]
source: std::io::Error, },
#[error("parse error in file {path:?}: {source}")]
Parse {
path: PathBuf, #[source]
source: ParserError, },
}
/// A file selected by the directory walk, paired with the language inferred
/// from its extension.
#[derive(Debug)]
struct DiscoveredFile {
    /// Path of the file as yielded by the walk.
    path: PathBuf,
    /// Language derived from the file extension.
    language: LanguageId,
}
/// Maps a file's extension to the language it should be parsed as.
///
/// The extension is compared ASCII case-insensitively. Returns `None` when
/// the path has no extension, the extension is not valid UTF-8, or the
/// extension is not one of the recognised ones.
fn language_from_extension(path: &Path) -> Option<LanguageId> {
    let extension = path.extension().and_then(|raw| raw.to_str())?;
    let language = match extension.to_ascii_lowercase().as_str() {
        "java" => LanguageId::Java,
        "js" => LanguageId::JavaScript,
        "ts" => LanguageId::TypeScript,
        "tsx" => LanguageId::Tsx,
        "py" => LanguageId::Python,
        "rs" => LanguageId::Rust,
        "go" => LanguageId::Go,
        // Erlang source files and header files share one language.
        "erl" | "hrl" => LanguageId::Erlang,
        "cs" => LanguageId::CSharp,
        _ => return None,
    };
    Some(language)
}
/// Walks `config.root` and collects every file that should be parsed.
///
/// An entry is kept when it is a file, its extension maps to a known
/// language, and — if `config.max_file_size` is set — its size does not
/// exceed that limit.
///
/// # Errors
///
/// Returns [`ScannerError::Walk`] if the directory walk yields an error and
/// [`ScannerError::ReadFile`] if a candidate file's metadata cannot be read.
fn discover_files(config: &FileScanConfig) -> Result<Vec<DiscoveredFile>, ScannerError> {
    let mut discovered = Vec::new();

    for item in WalkDir::new(&config.root).follow_links(config.follow_symlinks) {
        let item = item?;
        if !item.file_type().is_file() {
            continue;
        }

        let path = item.into_path();
        if let Some(language) = language_from_extension(&path) {
            // Metadata is only fetched when a size limit is configured.
            let too_large = match config.max_file_size {
                Some(limit) => {
                    let size = fs::metadata(&path)
                        .map_err(|source| ScannerError::ReadFile {
                            path: path.clone(),
                            source,
                        })?
                        .len();
                    size > limit
                }
                None => false,
            };

            if !too_large {
                discovered.push(DiscoveredFile { path, language });
            }
        }
    }

    Ok(discovered)
}
pub fn scan_and_parse(config: &FileScanConfig) -> Result<Vec<ParsedFile>, ScannerError> {
let files = discover_files(config)?;
let results: Result<Vec<_>, ScannerError> = files
.into_par_iter() .map(|file| {
let source = fs::read_to_string(&file.path).map_err(|source| ScannerError::ReadFile {
path: file.path.clone(),
source,
})?;
let tree =
parse_once(file.language, &source).map_err(|source| ScannerError::Parse {
path: file.path.clone(),
source,
})?;
let is_test = is_test_file(&file.path);
Ok(ParsedFile {
path: file.path,
language: file.language,
tree,
source,
is_test,
})
})
.collect();
results }
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the extension-to-language mapping, including an upper-case
    /// extension, both Erlang extensions and an unrecognised extension.
    #[test]
    fn maps_extensions_to_languages() {
        assert!(matches!(
            language_from_extension(Path::new("Foo.java")),
            Some(LanguageId::Java)
        ));
        assert!(matches!(
            language_from_extension(Path::new("a/b/c/app.js")),
            Some(LanguageId::JavaScript)
        ));
        // Extension matching ignores ASCII case.
        assert!(matches!(
            language_from_extension(Path::new("script.PY")),
            Some(LanguageId::Python)
        ));
        assert!(matches!(
            language_from_extension(Path::new("Program.cs")),
            Some(LanguageId::CSharp)
        ));
        assert!(matches!(
            language_from_extension(Path::new("handler.erl")),
            Some(LanguageId::Erlang)
        ));
        assert!(matches!(
            language_from_extension(Path::new("models.hrl")),
            Some(LanguageId::Erlang)
        ));
        assert!(language_from_extension(Path::new("README.md")).is_none());
    }
}