use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use walkdir::WalkDir;
/// A single file's location paired with its textual content.
///
/// When produced by `BatchFileCollector::collect_repository`, `path` is the
/// path of the walked directory entry and `content` is whatever
/// `tokio::fs::read_to_string` returned for that path.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileData {
/// Path of the file this record describes.
pub path: PathBuf,
/// Entire content of the file, held as a `String`.
pub content: String,
}
/// One batch of files gathered from beneath a common root directory.
///
/// `BatchFileCollector::collect_repository` emits one of these each time the
/// current batch reaches the configured size, plus a final one for any
/// leftover files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileCollection {
/// Directory the walk started from; the collector sets this to the
/// `repo_path` argument for every batch.
pub root_path: PathBuf,
/// Files belonging to this batch, in the order they were pushed.
pub files: Vec<FileData>,
}
/// Walks a directory tree and groups the readable files into batches.
///
/// Configure it through the builder-style `with_*` methods; the defaults are
/// provided by the [`Default`] implementation below.
pub struct BatchFileCollector {
    /// Number of files after which the current batch is closed.
    max_files_per_batch: usize,
    /// Glob patterns; a file whose path matches any of them is left out.
    exclude_patterns: Vec<String>,
}

impl Default for BatchFileCollector {
    /// Batches of up to 100 files, with no exclusion patterns.
    fn default() -> Self {
        BatchFileCollector {
            exclude_patterns: Vec::new(),
            max_files_per_batch: 100,
        }
    }
}
impl BatchFileCollector {
    /// Creates a collector with the default configuration (see the
    /// [`Default`] implementation).
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the number of files after which a batch is closed.
    ///
    /// A batch is flushed as soon as its length reaches `max` after a file is
    /// added, so a value of `0` yields batches of exactly one file.
    pub fn with_max_files_per_batch(mut self, max: usize) -> Self {
        self.max_files_per_batch = max;
        self
    }

    /// Replaces the list of glob patterns used to exclude files.
    ///
    /// Patterns that fail to parse as a `glob::Pattern` are ignored when
    /// collecting rather than reported as an error.
    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    /// Walks `repo_path` recursively and returns its files grouped into
    /// batches.
    ///
    /// Only regular files are considered. A file is skipped when its path, as
    /// yielded by the directory walk, matches any exclude pattern, or when it
    /// cannot be read into a `String` (the read error is discarded).
    ///
    /// Every returned [`FileCollection`] carries `repo_path` as its
    /// `root_path`. The last batch may hold fewer files than the configured
    /// maximum; no empty batch is ever returned.
    ///
    /// # Errors
    ///
    /// Returns an error as soon as the directory walk itself yields one; the
    /// batches gathered up to that point are dropped.
    pub async fn collect_repository(&self, repo_path: &Path) -> Result<Vec<FileCollection>> {
        // Compile the exclude patterns once up front instead of re-parsing
        // every pattern for every visited file. Invalid patterns are dropped,
        // which is equivalent to them never matching.
        let excludes: Vec<glob::Pattern> = self
            .exclude_patterns
            .iter()
            .filter_map(|pattern| glob::Pattern::new(pattern).ok())
            .collect();

        let mut collections = Vec::new();
        let mut current_batch = Vec::new();

        for entry in WalkDir::new(repo_path) {
            let entry = entry?;
            if !entry.file_type().is_file() {
                continue;
            }

            let path = entry.path();
            if excludes.iter().any(|pattern| pattern.matches_path(path)) {
                continue;
            }

            // Best effort: files that cannot be read as text are left out
            // rather than failing the whole collection.
            let content = match tokio::fs::read_to_string(path).await {
                Ok(content) => content,
                Err(_) => continue,
            };

            current_batch.push(FileData {
                path: path.to_path_buf(),
                content,
            });

            if current_batch.len() >= self.max_files_per_batch {
                collections.push(FileCollection {
                    root_path: repo_path.to_path_buf(),
                    // `take` hands the full batch over and leaves an empty
                    // Vec behind for the next one.
                    files: std::mem::take(&mut current_batch),
                });
            }
        }

        // Flush whatever is left over after the walk.
        if !current_batch.is_empty() {
            collections.push(FileCollection {
                root_path: repo_path.to_path_buf(),
                files: current_batch,
            });
        }

        Ok(collections)
    }
}