use super::file_info::FileInfo;
use super::language::detect_language;
use super::pdf_extractor::extract_pdf_to_markdown;
use anyhow::{Context, Result};
use ignore::WalkBuilder;
use sha2::{Digest, Sha256};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
/// Configuration for walking a directory tree and collecting indexable files.
///
/// Built with `FileWalker::new` and refined with the `with_*` builder
/// methods; `walk` performs the traversal.
///
/// Cloning a walker shares the cancellation flag (it is an `Arc`), so
/// cancelling one clone cancels all of them.
#[derive(Debug, Clone)]
pub struct FileWalker {
    /// Directory the traversal starts from.
    pub(crate) root: PathBuf,
    /// Optional project name copied onto every collected file record.
    pub(crate) project: Option<String>,
    /// Files whose on-disk size exceeds this many bytes are skipped.
    pub(crate) max_file_size: usize,
    /// Substrings of which a path must contain at least one (when the list is
    /// non-empty) to be collected. These are plain substrings, not globs.
    pub(crate) include_patterns: Vec<String>,
    /// Substrings that cause a path to be skipped when any of them occurs in
    /// it. These are plain substrings, not globs.
    pub(crate) exclude_patterns: Vec<String>,
    /// Optional shared flag; once it reads `true` the walk stops with an error.
    cancelled: Option<Arc<AtomicBool>>,
}
impl FileWalker {
pub fn new(root: impl AsRef<Path>, max_file_size: usize) -> Self {
Self {
root: root.as_ref().to_path_buf(),
project: None,
max_file_size,
include_patterns: vec![],
exclude_patterns: vec![],
cancelled: None,
}
}
pub fn with_cancellation_flag(mut self, cancelled: Arc<AtomicBool>) -> Self {
self.cancelled = Some(cancelled);
self
}
fn is_cancelled(&self) -> bool {
self.cancelled
.as_ref()
.is_some_and(|flag| flag.load(Ordering::Relaxed))
}
pub fn with_project(mut self, project: Option<String>) -> Self {
self.project = project;
self
}
pub fn with_patterns(
mut self,
include_patterns: Vec<String>,
exclude_patterns: Vec<String>,
) -> Self {
self.include_patterns = include_patterns;
self.exclude_patterns = exclude_patterns;
self
}
pub fn walk(&self) -> Result<Vec<FileInfo>> {
if !self.root.exists() {
anyhow::bail!("Root directory does not exist: {:?}", self.root);
}
if !self.root.is_dir() {
anyhow::bail!("Root path is not a directory: {:?}", self.root);
}
let mut files = Vec::new();
let walker = WalkBuilder::new(&self.root)
.standard_filters(true) .hidden(false) .git_ignore(true) .git_exclude(true) .git_global(true) .require_git(false) .build();
for entry in walker {
if self.is_cancelled() {
tracing::info!("File walk cancelled after {} files", files.len());
anyhow::bail!("Indexing was cancelled");
}
let entry = entry.context("Failed to read directory entry")?;
let path = entry.path();
if path.is_dir() {
continue;
}
if path.components().any(|c| c.as_os_str() == ".git") {
tracing::debug!("Skipping .git directory file: {:?}", path);
continue;
}
if let Ok(metadata) = fs::metadata(path)
&& metadata.len() > self.max_file_size as u64
{
tracing::debug!("Skipping large file: {:?}", path);
continue;
}
let is_pdf = path
.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase() == "pdf")
.unwrap_or(false);
if !is_pdf && !self.is_text_file(path)? {
tracing::debug!("Skipping binary file: {:?}", path);
continue;
}
if !self.matches_patterns(path) {
continue;
}
let content = if is_pdf {
match extract_pdf_to_markdown(path) {
Ok(c) => c,
Err(e) => {
tracing::warn!("Failed to extract PDF {:?}: {}", path, e);
continue;
}
}
} else {
match fs::read_to_string(path) {
Ok(c) => c,
Err(e) => {
tracing::debug!(
"Skipping file that can't be read as UTF-8: {:?}: {}",
path,
e
);
continue;
}
}
};
let hash = self.calculate_hash(&content);
let relative_path = path
.strip_prefix(&self.root)
.unwrap_or(path)
.to_string_lossy()
.to_string();
let extension = path.extension().and_then(|e| e.to_str()).map(String::from);
let language = extension.as_ref().and_then(|ext| detect_language(ext));
files.push(FileInfo {
path: path.to_path_buf(),
relative_path,
root_path: self.root.to_string_lossy().to_string(),
project: self.project.clone(),
extension,
language,
content,
hash,
});
}
tracing::info!("Found {} files to index", files.len());
Ok(files)
}
pub(crate) fn is_text_file(&self, path: &Path) -> Result<bool> {
let content = fs::read(path).context("Failed to read file")?;
let non_printable = content
.iter()
.filter(|&&b| b < 0x20 && b != b'\n' && b != b'\r' && b != b'\t')
.count();
Ok((non_printable as f64 / content.len() as f64) < 0.3)
}
pub(crate) fn matches_patterns(&self, path: &Path) -> bool {
let path_str = path.to_string_lossy();
if !self.include_patterns.is_empty() {
let matches_include = self
.include_patterns
.iter()
.any(|pattern| path_str.contains(pattern));
if !matches_include {
return false;
}
}
if self
.exclude_patterns
.iter()
.any(|pattern| path_str.contains(pattern))
{
return false;
}
true
}
pub(crate) fn calculate_hash(&self, content: &str) -> String {
let mut hasher = Sha256::new();
hasher.update(content.as_bytes());
format!("{:x}", hasher.finalize())
}
}
#[cfg(test)]
mod tests;