use anyhow::Result;
use rayon::prelude::*;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use super::{CodeGraph, ScanProgress};
use crate::diagnostics::{DiagnosticStage, WatchDiagnostic};
use crate::graph::filter::{skip_diagnostic, FileFilter};
use crate::validation::{validate_path_within_root, PathValidationError};
/// Outcome of a directory scan.
#[derive(Debug, Default)]
pub struct ScanResult {
/// Count reported by the scan that produced this value. What exactly is
/// counted is decided by the producing function.
pub indexed: usize,
/// Diagnostics gathered while scanning: skipped paths and per-file errors.
pub diagnostics: Vec<WatchDiagnostic>,
}
pub async fn scan_directory_async(
graph: &mut CodeGraph,
dir_path: &Path,
filter: &crate::graph::filter::FileFilter,
progress: Option<&ScanProgress>,
) -> Result<ScanResult> {
use crate::diagnostics::SkipReason;
use crate::indexer::async_io::read_files_async;
let mut candidate_files: Vec<PathBuf> = Vec::new();
let mut diagnostics = Vec::new();
for entry in walkdir::WalkDir::new(dir_path)
.follow_links(false)
.into_iter()
.filter_map(std::result::Result::ok)
{
let path = entry.path();
if path.is_dir() {
continue;
}
match validate_path_within_root(path, dir_path) {
Ok(_) => {
if filter.should_skip(path).is_none() {
candidate_files.push(path.to_path_buf());
} else {
let reason = filter
.should_skip(path)
.unwrap_or(SkipReason::IgnoredInternal);
let rel_path = path
.strip_prefix(dir_path)
.unwrap_or(path)
.to_string_lossy()
.to_string();
diagnostics.push(WatchDiagnostic::skipped(rel_path, reason));
}
}
Err(_e) => {
let rel_path = path
.strip_prefix(dir_path)
.unwrap_or(path)
.to_string_lossy()
.to_string();
diagnostics.push(WatchDiagnostic::skipped(
rel_path,
SkipReason::IgnoredInternal,
));
}
}
}
candidate_files.sort();
let total = candidate_files.len();
let indexed = AtomicUsize::new(0);
let file_contents = read_files_async(candidate_files.clone()).await?;
for (path, content) in file_contents {
let path_str = path.to_string_lossy().to_string();
if let Ok(count) = crate::graph::ops::index_file(graph, &path_str, &content) {
indexed.fetch_add(count, Ordering::SeqCst);
}
if let Some(progress_fn) = progress {
progress_fn(indexed.load(Ordering::SeqCst), total, &path_str);
}
}
Ok(ScanResult {
indexed: indexed.load(Ordering::SeqCst),
diagnostics,
})
}
/// Result of reading one candidate file from disk.
///
/// The constructors in this file populate exactly one of `source` (read
/// succeeded) or `error` (read failed).
struct FileReadResult {
/// Path string handed to the graph when indexing; empty when the read failed.
path_str: String,
/// Path used for progress reporting and diagnostics.
rel_path: String,
/// Raw file bytes when the read succeeded.
source: Option<Vec<u8>>,
/// Read-stage diagnostic when the read failed.
error: Option<WatchDiagnostic>,
}
impl FileReadResult {
fn ok(path_str: String, rel_path: String, source: Vec<u8>) -> Self {
Self {
path_str,
rel_path,
source: Some(source),
error: None,
}
}
fn error(rel_path: String, error_msg: String) -> Self {
let diagnostic = WatchDiagnostic::error(rel_path.clone(), DiagnosticStage::Read, error_msg);
Self {
path_str: String::new(),
rel_path,
source: None,
error: Some(diagnostic),
}
}
fn is_error(&self) -> bool {
self.error.is_some()
}
}
pub fn scan_directory_with_filter(
graph: &mut CodeGraph,
dir_path: &Path,
filter: &FileFilter,
progress: Option<&ScanProgress>,
) -> Result<ScanResult> {
let mut candidate_files: Vec<PathBuf> = Vec::new();
let mut diagnostics = Vec::new();
for entry in walkdir::WalkDir::new(dir_path)
.follow_links(false)
.into_iter()
.filter_map(std::result::Result::ok)
{
let path = entry.path();
if path.is_dir() {
continue;
}
match validate_path_within_root(path, dir_path) {
Ok(_) => {
}
Err(PathValidationError::OutsideRoot(_p, _)) => {
let rel_path = path
.strip_prefix(dir_path)
.unwrap_or(path)
.to_string_lossy()
.to_string();
diagnostics.push(WatchDiagnostic::skipped(
rel_path,
crate::diagnostics::SkipReason::IgnoredInternal,
));
continue;
}
Err(PathValidationError::SymlinkEscape(_from, to)) => {
let rel_path = path
.strip_prefix(dir_path)
.unwrap_or(path)
.to_string_lossy()
.to_string();
diagnostics.push(WatchDiagnostic::error(
rel_path,
DiagnosticStage::Read,
format!("symlink escapes root: {}", to),
));
continue;
}
Err(PathValidationError::CannotCanonicalize(_)) => {
continue;
}
Err(PathValidationError::SuspiciousTraversal(p)) => {
diagnostics.push(WatchDiagnostic::error(
p,
DiagnosticStage::Read,
"suspicious traversal pattern".to_string(),
));
continue;
}
}
if let Some(reason) = filter.should_skip(path) {
diagnostics.push(skip_diagnostic(dir_path, path, reason));
continue;
}
candidate_files.push(path.to_path_buf());
}
candidate_files.sort();
let total = candidate_files.len();
let file_metadata: Vec<(PathBuf, String, String)> = candidate_files
.iter()
.map(|path| {
let path_str = crate::validation::normalize_path(path)
.unwrap_or_else(|_| path.to_string_lossy().to_string());
let rel_path = path
.strip_prefix(dir_path)
.map(|p| p.to_string_lossy().into_owned())
.unwrap_or_else(|_| path_str.clone());
(path.clone(), path_str, rel_path)
})
.collect();
let read_results: Vec<FileReadResult> = file_metadata
.par_iter() .map(|(path, path_str, rel_path)| {
match std::fs::read(path) {
Ok(source) => FileReadResult::ok(path_str.clone(), rel_path.clone(), source),
Err(e) => FileReadResult::error(rel_path.clone(), e.to_string()),
}
})
.collect();
let indexed_count = AtomicUsize::new(0);
for result in read_results {
let current = indexed_count.fetch_add(1, Ordering::Relaxed) + 1;
if let Some(cb) = progress {
cb(current, total, &result.rel_path);
}
if result.is_error() {
if let Some(err) = result.error {
diagnostics.push(err);
}
continue;
}
let path_str = &result.path_str;
let rel_path = &result.rel_path;
let source = match result.source.as_ref() {
Some(s) => s,
None => {
diagnostics.push(WatchDiagnostic::error(
rel_path.clone(),
DiagnosticStage::Read,
"Source is empty after successful read".to_string(),
));
continue;
}
};
let _ = graph.delete_file(path_str);
match graph.index_file(path_str, source) {
Ok(_) => {}
Err(e) => {
diagnostics.push(WatchDiagnostic::error(
rel_path.clone(),
DiagnosticStage::IndexSymbols,
e.to_string(),
));
continue;
}
}
match graph.index_references(path_str, source) {
Ok(_) => {}
Err(e) => {
diagnostics.push(WatchDiagnostic::error(
rel_path.clone(),
DiagnosticStage::IndexReferences,
e.to_string(),
));
}
}
}
diagnostics.sort();
Ok(ScanResult {
indexed: total,
diagnostics,
})
}
/// Scans `dir_path` with a filter built from empty include and exclude
/// pattern lists and returns only the `indexed` count of the scan.
///
/// # Errors
/// Returns an error if the filter cannot be constructed or if
/// `scan_directory_with_filter` fails.
pub fn scan_directory(
    graph: &mut CodeGraph,
    dir_path: &Path,
    progress: Option<&ScanProgress>,
) -> Result<usize> {
    let default_filter = FileFilter::new(dir_path, &[], &[])?;
    scan_directory_with_filter(graph, dir_path, &default_filter, progress)
        .map(|outcome| outcome.indexed)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Database files and their journals are skipped with a diagnostic while
    /// source files are indexed.
    #[test]
    fn test_scan_filters_database_files() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        let mut graph = crate::CodeGraph::open(&db_path).unwrap();

        let code_rs = temp_dir.path().join("code.rs");
        let data_db = temp_dir.path().join("data.db");
        let journal = temp_dir.path().join("test.db-journal");
        std::fs::write(&code_rs, b"fn test() {}").unwrap();
        std::fs::write(&data_db, b"database data").unwrap();
        std::fs::write(&journal, b"journal data").unwrap();

        let filter = FileFilter::new(temp_dir.path(), &[], &[]).unwrap();
        let result =
            scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

        assert_eq!(result.indexed, 1, "Should only scan 1 .rs file");
        let symbols = graph.symbols_in_file(code_rs.to_str().unwrap()).unwrap();
        assert_eq!(symbols.len(), 1);

        assert!(result.diagnostics.len() >= 2);
        let db_diag = result
            .diagnostics
            .iter()
            .find(|d| d.path().contains("data.db"));
        assert!(db_diag.is_some());
    }

    /// Files matched by `.gitignore` are skipped and reported.
    #[test]
    fn test_scan_with_gitignore() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        std::fs::write(temp_dir.path().join(".gitignore"), "ignored.rs").unwrap();
        std::fs::write(temp_dir.path().join("included.rs"), b"fn included() {}").unwrap();
        std::fs::write(temp_dir.path().join("ignored.rs"), b"fn ignored() {}").unwrap();

        let mut graph = crate::CodeGraph::open(&db_path).unwrap();
        let filter = FileFilter::new(temp_dir.path(), &[], &[]).unwrap();
        let result =
            scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

        assert_eq!(result.indexed, 1);
        let ignored_diag = result.diagnostics.iter().find(|d| d.path() == "ignored.rs");
        assert!(ignored_diag.is_some());
    }

    /// With an include pattern, only matching files are indexed; the rest
    /// are reported.
    #[test]
    fn test_scan_with_include_patterns() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        std::fs::create_dir_all(temp_dir.path().join("src")).unwrap();
        std::fs::create_dir_all(temp_dir.path().join("tests")).unwrap();
        std::fs::write(temp_dir.path().join("src/lib.rs"), b"fn lib() {}").unwrap();
        std::fs::write(temp_dir.path().join("tests/test.rs"), b"fn test() {}").unwrap();

        let mut graph = crate::CodeGraph::open(&db_path).unwrap();
        let filter = FileFilter::new(temp_dir.path(), &["src/**".to_string()], &[]).unwrap();
        let result =
            scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

        assert_eq!(result.indexed, 1);
        let tests_diag = result
            .diagnostics
            .iter()
            .find(|d| d.path().contains("tests"));
        assert!(tests_diag.is_some());
    }

    /// Files matching an exclude pattern are skipped and reported.
    #[test]
    fn test_scan_with_exclude_patterns() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        std::fs::write(temp_dir.path().join("lib.rs"), b"fn lib() {}").unwrap();
        std::fs::write(temp_dir.path().join("test.rs"), b"fn test() {}").unwrap();

        let mut graph = crate::CodeGraph::open(&db_path).unwrap();
        let filter = FileFilter::new(temp_dir.path(), &[], &["**/*test*.rs".to_string()]).unwrap();
        let result =
            scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

        assert_eq!(result.indexed, 1);
        let test_diag = result
            .diagnostics
            .iter()
            .find(|d| d.path().contains("test.rs"));
        assert!(test_diag.is_some());
    }

    /// An unreadable file produces a diagnostic and does not stop the scan.
    #[test]
    fn test_scan_continues_on_error() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        std::fs::write(temp_dir.path().join("good.rs"), b"fn good() {}").unwrap();
        let bad_file = temp_dir.path().join("bad.rs");
        std::fs::write(&bad_file, b"fn bad() {}").unwrap();

        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;

            // Remove all permissions so reading the file fails.
            let mut perms = std::fs::metadata(&bad_file).unwrap().permissions();
            perms.set_mode(0o000);
            std::fs::set_permissions(&bad_file, perms).unwrap();

            let mut graph = crate::CodeGraph::open(&db_path).unwrap();
            let filter = FileFilter::new(temp_dir.path(), &[], &[]).unwrap();
            let result =
                scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

            // Restore permissions before asserting so that a failed assertion
            // cannot leave an unreadable file behind.
            let mut perms = std::fs::metadata(&bad_file).unwrap().permissions();
            perms.set_mode(0o644);
            std::fs::set_permissions(&bad_file, perms).unwrap();

            assert!(result.indexed >= 1);
            let bad_diag = result
                .diagnostics
                .iter()
                .find(|d| d.path().contains("bad.rs"));
            assert!(bad_diag.is_some());
        }

        #[cfg(not(unix))]
        {
            // Permissions are not altered on this platform, so both files
            // are readable.
            let mut graph = crate::CodeGraph::open(&db_path).unwrap();
            let filter = FileFilter::new(temp_dir.path(), &[], &[]).unwrap();
            let result =
                scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();
            assert_eq!(result.indexed, 2);
        }
    }

    /// The diagnostics returned by the scanner are already in sorted order.
    #[test]
    fn test_diagnostics_sorted() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        std::fs::write(temp_dir.path().join(".gitignore"), "*.rs\n").unwrap();
        std::fs::write(temp_dir.path().join("c.rs"), b"").unwrap();
        std::fs::write(temp_dir.path().join("a.rs"), b"").unwrap();
        std::fs::write(temp_dir.path().join("b.rs"), b"").unwrap();

        let mut graph = crate::CodeGraph::open(&db_path).unwrap();
        let filter = FileFilter::new(temp_dir.path(), &[], &[]).unwrap();
        let result =
            scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

        assert!(!result.diagnostics.is_empty());

        // Compare the scanner's own output with a sorted copy; comparing two
        // sorted copies with each other would always pass.
        let mut expected = result.diagnostics.clone();
        expected.sort();
        assert_eq!(
            result.diagnostics, expected,
            "scanner must return diagnostics in sorted order"
        );

        assert!(result.diagnostics.iter().any(|d| d.path() == "a.rs"));
        assert!(result.diagnostics.iter().any(|d| d.path() == "b.rs"));
        assert!(result.diagnostics.iter().any(|d| d.path() == "c.rs"));
    }

    /// A regular file in a subdirectory passes path validation and is indexed.
    #[test]
    fn test_scan_rejects_path_traversal() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        let subdir = temp_dir.path().join("src");
        std::fs::create_dir(&subdir).unwrap();
        std::fs::write(subdir.join("valid.rs"), b"fn valid() {}").unwrap();

        let mut graph = crate::CodeGraph::open(&db_path).unwrap();
        let filter = FileFilter::new(temp_dir.path(), &[], &[]).unwrap();
        let result =
            scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

        assert_eq!(result.indexed, 1);
    }

    /// A symlink pointing outside the scan root must not cause the target
    /// file to be indexed.
    #[test]
    fn test_scan_with_symlink_to_outside() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let outside_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        let outside_file = outside_dir.path().join("outside.rs");
        std::fs::write(&outside_file, b"fn outside() {}").unwrap();

        let symlink = temp_dir.path().join("link.rs");
        #[cfg(unix)]
        std::os::unix::fs::symlink(&outside_file, &symlink).unwrap();
        #[cfg(windows)]
        std::os::windows::fs::symlink_file(&outside_file, &symlink).unwrap();

        let mut graph = crate::CodeGraph::open(&db_path).unwrap();
        let filter = FileFilter::new(temp_dir.path(), &[], &[]).unwrap();
        let _result =
            scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

        #[cfg(any(unix, windows))]
        {
            let symbols = graph.symbols_in_file(outside_file.to_str().unwrap());
            assert!(symbols.is_err() || symbols.unwrap().is_empty());
        }
    }

    /// Several valid files in the root are all indexed.
    #[test]
    fn test_scan_continues_after_traversal_rejection() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let db_path = temp_dir.path().join("test.db");
        std::fs::write(temp_dir.path().join("good.rs"), b"fn good() {}").unwrap();
        std::fs::write(temp_dir.path().join("better.rs"), b"fn better() {}").unwrap();

        let mut graph = crate::CodeGraph::open(&db_path).unwrap();
        let filter = FileFilter::new(temp_dir.path(), &[], &[]).unwrap();
        let result =
            scan_directory_with_filter(&mut graph, temp_dir.path(), &filter, None).unwrap();

        assert_eq!(result.indexed, 2);
    }
}