ai_context_gen/scanner.rs
//! Repository scanning module for the AI Context Generator.
//!
//! This module provides functionality to scan and analyze repository structure,
//! extracting metadata, file information, and project organization.

use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use walkdir::WalkDir;

use crate::config::{Config, IGNORED_DIRS, IGNORED_FILES, SUPPORTED_EXTENSIONS};

/// Information about a single file in the repository.
///
/// Contains both metadata and content for files that are included in the analysis.
/// This structure is used to pass file information between scanning and generation phases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileInfo {
    /// Absolute path to the file on the filesystem.
    pub path: PathBuf,

    /// Path relative to the repository root.
    ///
    /// This is used for display purposes in the generated context.
    pub relative_path: PathBuf,

    /// Complete content of the file as a string.
    ///
    /// For text files, this contains the entire file content.
    /// Binary files are not processed and won't appear in scan results.
    pub content: String,

    /// Type classification of the file based on its extension.
    pub file_type: FileType,

    /// Size of the file in bytes.
    pub size: u64,
}

/// Classification of file types supported by the generator.
///
/// Different file types receive different processing and priority levels
/// during context generation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FileType {
    /// Rust source files (`.rs` extension).
    ///
    /// These files receive full AST analysis to extract structural information
    /// about modules, functions, structs, enums, and implementations.
    Rust,

    /// Markdown documentation files (`.md` extension).
    ///
    /// These files are included as high-priority documentation content.
    Markdown,
}

/// Complete result of a repository scanning operation.
///
/// Contains all information gathered during the scanning phase, including
/// individual files, project structure, and metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanResult {
    /// List of all files that were processed during scanning.
    ///
    /// Only files with supported extensions that passed filtering are included.
    pub files: Vec<FileInfo>,

    /// Structural information about the project organization.
    pub project_structure: ProjectStructure,

    /// Metadata extracted from project configuration files.
    pub metadata: ProjectMetadata,
}

/// Information about the overall structure and organization of the project.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectStructure {
    /// String representation of the project's file tree.
    ///
    /// This is formatted as a text-based tree structure suitable for
    /// inclusion in markdown documentation.
    pub tree: String,

    /// Total number of files that were processed.
    pub total_files: usize,

    /// Combined size of all processed files in bytes.
    pub total_size: u64,
}

/// Project metadata extracted from configuration files and repository structure.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectMetadata {
    /// Name of the project.
    ///
    /// Extracted from `Cargo.toml` if available, otherwise derived from
    /// the repository directory name.
    pub name: String,

    /// Project description, if available.
    ///
    /// Extracted from the opening lines of `README.md`, if present.
    pub description: Option<String>,

    /// List of main dependencies.
    ///
    /// Extracted from the `[dependencies]` section of `Cargo.toml`.
    pub dependencies: Vec<String>,

    /// Project version.
    ///
    /// Extracted from the `version` field in the `[package]` section of `Cargo.toml`.
    pub rust_version: Option<String>,
}

/// Repository scanner that processes project files and structure.
///
/// The scanner walks through the repository directory, identifies relevant files,
/// extracts their content, and gathers project metadata.
///
/// # Examples
///
/// ```rust
/// use ai_context_gen::{Config, RepositoryScanner};
/// use std::path::PathBuf;
///
/// # async fn example() -> anyhow::Result<()> {
/// let config = Config {
///     repo_path: PathBuf::from("."),
///     max_tokens: 50000,
///     output_file: "context.md".to_string(),
///     include_hidden: false,
///     include_deps: true,
/// };
///
/// let scanner = RepositoryScanner::new(config);
/// let scan_result = scanner.scan().await?;
///
/// println!("Found {} files", scan_result.files.len());
/// # Ok(())
/// # }
/// ```
pub struct RepositoryScanner {
    config: Config,
}

impl RepositoryScanner {
    /// Creates a new repository scanner with the given configuration.
    ///
    /// # Arguments
    ///
    /// * `config` - Configuration specifying scanning behavior and output options
    ///
    /// # Examples
    ///
    /// ```rust
    /// use ai_context_gen::{Config, RepositoryScanner};
    ///
    /// let config = Config::default();
    /// let scanner = RepositoryScanner::new(config);
    /// ```
    pub fn new(config: Config) -> Self {
        Self { config }
    }

    /// Performs a complete scan of the repository.
    ///
    /// This method walks through the repository directory structure, processes
    /// all supported files, extracts project metadata, and builds a comprehensive
    /// scan result.
    ///
    /// # Returns
    ///
    /// A `ScanResult` containing all discovered files, project structure, and metadata.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The repository path doesn't exist or isn't accessible
    /// - File system errors occur during scanning
    /// - Files can't be read or parsed
    ///
    /// # Examples
    ///
    /// ```rust
    /// use ai_context_gen::{Config, RepositoryScanner};
    ///
    /// # async fn example() -> anyhow::Result<()> {
    /// let config = Config::default();
    /// let scanner = RepositoryScanner::new(config);
    /// let result = scanner.scan().await?;
    ///
    /// println!("Scanned {} files", result.files.len());
    /// println!("Total size: {} bytes", result.project_structure.total_size);
    /// # Ok(())
    /// # }
    /// ```
    pub async fn scan(&self) -> Result<ScanResult> {
        let mut files = Vec::new();
        let mut total_size = 0u64;

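        // filter_entry prunes entire subtrees: when a directory is rejected by
        // should_include_path, WalkDir does not descend into it at all.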
        for entry in WalkDir::new(&self.config.repo_path)
            .into_iter()
            .filter_entry(|e| self.should_include_path(e.path()))
        {
            let entry = entry?;
            let path = entry.path();

            if path.is_file() {
                if let Some(file_info) = self.process_file(path).await? {
                    total_size += file_info.size;
                    files.push(file_info);
                }
            }
        }

        let project_structure = self.build_project_structure(&files, total_size)?;
        let metadata = self.extract_project_metadata().await?;

        Ok(ScanResult {
            files,
            project_structure,
            metadata,
        })
    }

    /// Determines whether a path should be included in the scan.
    ///
    /// This method applies filtering rules based on the configuration and
    /// predefined ignore lists to determine if a file or directory should
    /// be processed.
    ///
    /// # Arguments
    ///
    /// * `path` - The path to evaluate for inclusion
    ///
    /// # Returns
    ///
    /// `true` if the path should be included, `false` otherwise
    fn should_include_path(&self, path: &Path) -> bool {
        let path_str = path.to_string_lossy();

        // Ignore hidden files and directories unless configured to include them
        if !self.config.include_hidden && path_str.contains("/.") {
            return false;
        }

        // Ignore specific directories
        for ignored_dir in IGNORED_DIRS {
            if path_str.contains(ignored_dir) {
                return false;
            }
        }

        // If it's a file, check whether it's supported
        if path.is_file() {
            let filename = path.file_name().unwrap_or_default().to_string_lossy();

            // Ignore specific files
            if IGNORED_FILES.contains(&filename.as_ref()) {
                return false;
            }

            // Check the extension
            if let Some(ext) = path.extension() {
                let ext_str = format!(".{}", ext.to_string_lossy());
                return SUPPORTED_EXTENSIONS.contains(&ext_str.as_str());
            }

            return false;
        }

        true
    }

    /// Processes a single file and extracts its information.
    ///
    /// Reads the file content, determines its type based on extension,
    /// and creates a `FileInfo` structure with all relevant metadata.
    ///
    /// # Arguments
    ///
    /// * `path` - Path to the file to process
    ///
    /// # Returns
    ///
    /// `Some(FileInfo)` if the file was successfully processed and should be included,
    /// `None` if the file should be skipped
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read or its metadata cannot be accessed
    async fn process_file(&self, path: &Path) -> Result<Option<FileInfo>> {
        let content = fs::read_to_string(path)?;
        let metadata = fs::metadata(path)?;

        let file_type = match path.extension().and_then(|ext| ext.to_str()) {
            Some("rs") => FileType::Rust,
            Some("md") => FileType::Markdown,
            _ => return Ok(None),
        };

        let relative_path = path
            .strip_prefix(&self.config.repo_path)
            .unwrap_or(path)
            .to_path_buf();

        Ok(Some(FileInfo {
            path: path.to_path_buf(),
            relative_path,
            content,
            file_type,
            size: metadata.len(),
        }))
    }

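    /// Builds a text-based tree representation of the scanned files.
    ///
    /// Sorts the relative paths of all processed files and renders them as an
    /// indented tree inside a fenced code block, alongside the total file count
    /// and combined size of the scan.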
    fn build_project_structure(
        &self,
        files: &[FileInfo],
        total_size: u64,
    ) -> Result<ProjectStructure> {
        let mut tree = String::new();
        let mut paths: Vec<_> = files.iter().map(|f| &f.relative_path).collect();
        paths.sort();

        tree.push_str("```\n");
        for (i, path) in paths.iter().enumerate() {
            let depth = path.components().count() - 1;
            let indent = "│   ".repeat(depth);
            let connector = if i == paths.len() - 1 {
                "└── "
            } else {
                "├── "
            };

            tree.push_str(&format!("{}{}{}\n", indent, connector, path.display()));
        }
        tree.push_str("```\n");

        Ok(ProjectStructure {
            tree,
            total_files: files.len(),
            total_size,
        })
    }

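    /// Gathers project metadata from `Cargo.toml` and `README.md`.
    ///
    /// Falls back to the repository directory name for the project name when
    /// no `Cargo.toml` is present; the description is taken from the opening
    /// lines of `README.md` when that file exists.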
    async fn extract_project_metadata(&self) -> Result<ProjectMetadata> {
        let cargo_toml_path = self.config.repo_path.join("Cargo.toml");
        let readme_path = self.config.repo_path.join("README.md");

        let mut metadata = ProjectMetadata {
            name: self
                .config
                .repo_path
                .file_name()
                .unwrap_or_default()
                .to_string_lossy()
                .to_string(),
            description: None,
            dependencies: Vec::new(),
            rust_version: None,
        };

        // Extract information from Cargo.toml
        if cargo_toml_path.exists() {
            let cargo_content = fs::read_to_string(&cargo_toml_path)?;
            self.parse_cargo_toml(&cargo_content, &mut metadata)?;
        }

        // Extract description from README.md
        if readme_path.exists() {
            let readme_content = fs::read_to_string(&readme_path)?;
            metadata.description = self.extract_description_from_readme(&readme_content);
        }

        Ok(metadata)
    }

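    /// Extracts the package name, version, and dependency names from `Cargo.toml`.
    ///
    /// This is a lightweight, line-based parse rather than a full TOML parse: it
    /// tracks whether the current line falls under `[package]` or `[dependencies]`
    /// and reads the values it needs from simple `key = value` lines.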
    fn parse_cargo_toml(&self, content: &str, metadata: &mut ProjectMetadata) -> Result<()> {
        let lines: Vec<&str> = content.lines().collect();
        let mut in_package = false;
        let mut in_dependencies = false;

        for line in lines {
            let line = line.trim();

            if line.starts_with("[package]") {
                in_package = true;
                in_dependencies = false;
                continue;
            }

            if line.starts_with("[dependencies") {
                in_package = false;
                in_dependencies = true;
                continue;
            }

            if line.starts_with("[") {
                in_package = false;
                in_dependencies = false;
                continue;
            }

            if in_package {
                if line.starts_with("name") {
                    if let Some(name) = line.split('=').nth(1) {
                        metadata.name = name.trim().trim_matches('"').to_string();
                    }
                } else if line.starts_with("version") {
                    if let Some(version) = line.split('=').nth(1) {
                        metadata.rust_version = Some(version.trim().trim_matches('"').to_string());
                    }
                }
            }

            // Skip blank lines and TOML comments so they aren't recorded as dependencies
            if in_dependencies && !line.is_empty() && !line.starts_with('#') {
                if let Some(dep_name) = line.split('=').next() {
                    metadata.dependencies.push(dep_name.trim().to_string());
                }
            }
        }

        Ok(())
    }

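    /// Derives a short project description from the beginning of `README.md`.
    ///
    /// Scans the first ten lines, skips headings, and accumulates non-empty
    /// lines until roughly 200 characters have been collected. Returns `None`
    /// when nothing usable is found.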
    fn extract_description_from_readme(&self, content: &str) -> Option<String> {
        let lines: Vec<&str> = content.lines().collect();
        let mut description = String::new();

        for line in lines.iter().take(10) {
            if line.starts_with('#') {
                continue;
            }

            if !line.trim().is_empty() {
                description.push_str(line);
                description.push('\n');

                if description.len() > 200 {
                    break;
                }
            }
        }

        if description.trim().is_empty() {
            None
        } else {
            Some(description.trim().to_string())
        }
    }
}