ai_context_gen/
scanner.rs

1//! Repository scanning module for the AI Context Generator.
2//!
3//! This module provides functionality to scan and analyze repository structure,
4//! extracting metadata, file information, and project organization.
5
6use anyhow::Result;
7use serde::{Deserialize, Serialize};
8use std::fs;
9use std::path::{Path, PathBuf};
10use walkdir::WalkDir;
11
12use crate::config::{Config, IGNORED_DIRS, IGNORED_FILES, SUPPORTED_EXTENSIONS};
13
14/// Information about a single file in the repository.
15///
16/// Contains both metadata and content for files that are included in the analysis.
17/// This structure is used to pass file information between scanning and generation phases.
18#[derive(Debug, Clone, Serialize, Deserialize)]
19pub struct FileInfo {
20    /// Absolute path to the file on the filesystem.
21    pub path: PathBuf,
22
23    /// Path relative to the repository root.
24    ///
25    /// This is used for display purposes in the generated context.
26    pub relative_path: PathBuf,
27
28    /// Complete content of the file as a string.
29    ///
30    /// For text files, this contains the entire file content.
31    /// Binary files are not processed and won't appear in scan results.
32    pub content: String,
33
34    /// Type classification of the file based on its extension.
35    pub file_type: FileType,
36
37    /// Size of the file in bytes.
38    pub size: u64,
39}
40
41/// Classification of file types supported by the generator.
42///
43/// Different file types receive different processing and priority levels
44/// during context generation.
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub enum FileType {
47    /// Rust source files (`.rs` extension).
48    ///
49    /// These files receive full AST analysis to extract structural information
50    /// about modules, functions, structs, enums, and implementations.
51    Rust,
52
53    /// Markdown documentation files (`.md` extension).
54    ///
55    /// These files are included as high-priority documentation content.
56    Markdown,
57}
58
59/// Complete result of repository scanning operation.
60///
61/// Contains all information gathered during the scanning phase, including
62/// individual files, project structure, and metadata.
63#[derive(Debug, Clone, Serialize, Deserialize)]
64pub struct ScanResult {
65    /// List of all files that were processed during scanning.
66    ///
67    /// Only files with supported extensions that passed filtering are included.
68    pub files: Vec<FileInfo>,
69
70    /// Structural information about the project organization.
71    pub project_structure: ProjectStructure,
72
73    /// Metadata extracted from project configuration files.
74    pub metadata: ProjectMetadata,
75}
76
77/// Information about the overall structure and organization of the project.
78#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct ProjectStructure {
80    /// String representation of the project's file tree.
81    ///
82    /// This is formatted as a text-based tree structure suitable for
83    /// inclusion in markdown documentation.
84    pub tree: String,
85
86    /// Total number of files that were processed.
87    pub total_files: usize,
88
89    /// Combined size of all processed files in bytes.
90    pub total_size: u64,
91}
92
93/// Project metadata extracted from configuration files and repository structure.
94#[derive(Debug, Clone, Serialize, Deserialize)]
95pub struct ProjectMetadata {
96    /// Name of the project.
97    ///
98    /// Extracted from `Cargo.toml` if available, otherwise derived from
99    /// the repository directory name.
100    pub name: String,
101
102    /// Project description, if available.
103    ///
104    /// Extracted from `Cargo.toml` description field or README.md content.
105    pub description: Option<String>,
106
107    /// List of main dependencies.
108    ///
109    /// Extracted from the `[dependencies]` section of `Cargo.toml`.
110    pub dependencies: Vec<String>,
111
112    /// Rust version or project version.
113    ///
114    /// Extracted from `Cargo.toml` version field.
115    pub rust_version: Option<String>,
116}
117
118/// Repository scanner that processes project files and structure.
119///
120/// The scanner walks through the repository directory, identifies relevant files,
121/// extracts their content, and gathers project metadata.
122///
123/// # Examples
124///
125/// ```rust
126/// use ai_context_gen::{Config, RepositoryScanner};
127/// use std::path::PathBuf;
128///
129/// # async fn example() -> anyhow::Result<()> {
130/// let config = Config {
131///     repo_path: PathBuf::from("."),
132///     max_tokens: 50000,
133///     output_file: "context.md".to_string(),
134///     include_hidden: false,
135///     include_deps: true,
136/// };
137///
138/// let scanner = RepositoryScanner::new(config);
139/// let scan_result = scanner.scan().await?;
140///
141/// println!("Found {} files", scan_result.files.len());
142/// # Ok(())
143/// # }
144/// ```
145pub struct RepositoryScanner {
146    config: Config,
147}
148
149impl RepositoryScanner {
150    /// Creates a new repository scanner with the given configuration.
151    ///
152    /// # Arguments
153    ///
154    /// * `config` - Configuration specifying scanning behavior and output options
155    ///
156    /// # Examples
157    ///
158    /// ```rust
159    /// use ai_context_gen::{Config, RepositoryScanner};
160    ///
161    /// let config = Config::default();
162    /// let scanner = RepositoryScanner::new(config);
163    /// ```
164    pub fn new(config: Config) -> Self {
165        Self { config }
166    }
167
168    /// Performs a complete scan of the repository.
169    ///
170    /// This method walks through the repository directory structure, processes
171    /// all supported files, extracts project metadata, and builds a comprehensive
172    /// scan result.
173    ///
174    /// # Returns
175    ///
176    /// A `ScanResult` containing all discovered files, project structure, and metadata.
177    ///
178    /// # Errors
179    ///
180    /// Returns an error if:
181    /// - The repository path doesn't exist or isn't accessible
182    /// - File system errors occur during scanning
183    /// - Files can't be read or parsed
184    ///
185    /// # Examples
186    ///
187    /// ```rust
188    /// use ai_context_gen::{Config, RepositoryScanner};
189    ///
190    /// # async fn example() -> anyhow::Result<()> {
191    /// let config = Config::default();
192    /// let scanner = RepositoryScanner::new(config);
193    /// let result = scanner.scan().await?;
194    ///
195    /// println!("Scanned {} files", result.files.len());
196    /// println!("Total size: {} bytes", result.project_structure.total_size);
197    /// # Ok(())
198    /// # }
199    /// ```
200    pub async fn scan(&self) -> Result<ScanResult> {
201        let mut files = Vec::new();
202        let mut total_size = 0u64;
203
204        for entry in WalkDir::new(&self.config.repo_path)
205            .into_iter()
206            .filter_entry(|e| self.should_include_path(e.path()))
207        {
208            let entry = entry?;
209            let path = entry.path();
210
211            if path.is_file() {
212                if let Some(file_info) = self.process_file(path).await? {
213                    total_size += file_info.size;
214                    files.push(file_info);
215                }
216            }
217        }
218
219        let project_structure = self.build_project_structure(&files, total_size)?;
220        let metadata = self.extract_project_metadata().await?;
221
222        Ok(ScanResult {
223            files,
224            project_structure,
225            metadata,
226        })
227    }
228
229    /// Determines whether a path should be included in the scan.
230    ///
231    /// This method applies filtering rules based on the configuration and
232    /// predefined ignore lists to determine if a file or directory should
233    /// be processed.
234    ///
235    /// # Arguments
236    ///
237    /// * `path` - The path to evaluate for inclusion
238    ///
239    /// # Returns
240    ///
241    /// `true` if the path should be included, `false` otherwise
242    fn should_include_path(&self, path: &Path) -> bool {
243        let path_str = path.to_string_lossy();
244
245        // Ignore hidden directories if not configured to include them
246        if !self.config.include_hidden && path_str.contains("/.") {
247            return false;
248        }
249
250        // Ignore specific directories
251        for ignored_dir in IGNORED_DIRS {
252            if path_str.contains(ignored_dir) {
253                return false;
254            }
255        }
256
257        // If it's a file, check if it's supported
258        if path.is_file() {
259            let filename = path.file_name().unwrap_or_default().to_string_lossy();
260
261            // Ignore specific files
262            if IGNORED_FILES.contains(&filename.as_ref()) {
263                return false;
264            }
265
266            // Check extension
267            if let Some(ext) = path.extension() {
268                let ext_str = format!(".{}", ext.to_string_lossy());
269                return SUPPORTED_EXTENSIONS.contains(&ext_str.as_str());
270            }
271
272            return false;
273        }
274
275        true
276    }
277
278    /// Processes a single file and extracts its information.
279    ///
280    /// Reads the file content, determines its type based on extension,
281    /// and creates a `FileInfo` structure with all relevant metadata.
282    ///
283    /// # Arguments
284    ///
285    /// * `path` - Path to the file to process
286    ///
287    /// # Returns
288    ///
289    /// `Some(FileInfo)` if the file was successfully processed and should be included,
290    /// `None` if the file should be skipped
291    ///
292    /// # Errors
293    ///
294    /// Returns an error if the file cannot be read or metadata cannot be accessed
295    async fn process_file(&self, path: &Path) -> Result<Option<FileInfo>> {
296        let content = fs::read_to_string(path)?;
297        let metadata = fs::metadata(path)?;
298
299        let file_type = match path.extension().and_then(|ext| ext.to_str()) {
300            Some("rs") => FileType::Rust,
301            Some("md") => FileType::Markdown,
302            _ => return Ok(None),
303        };
304
305        let relative_path = path
306            .strip_prefix(&self.config.repo_path)
307            .unwrap_or(path)
308            .to_path_buf();
309
310        Ok(Some(FileInfo {
311            path: path.to_path_buf(),
312            relative_path,
313            content,
314            file_type,
315            size: metadata.len(),
316        }))
317    }
318
319    fn build_project_structure(
320        &self,
321        files: &[FileInfo],
322        total_size: u64,
323    ) -> Result<ProjectStructure> {
324        let mut tree = String::new();
325        let mut paths: Vec<_> = files.iter().map(|f| &f.relative_path).collect();
326        paths.sort();
327
328        tree.push_str("```\n");
329        for (i, path) in paths.iter().enumerate() {
330            let depth = path.components().count() - 1;
331            let indent = "│   ".repeat(depth);
332            let connector = if i == paths.len() - 1 {
333                "└── "
334            } else {
335                "├── "
336            };
337
338            tree.push_str(&format!("{}{}{}\n", indent, connector, path.display()));
339        }
340        tree.push_str("```\n");
341
342        Ok(ProjectStructure {
343            tree,
344            total_files: files.len(),
345            total_size,
346        })
347    }
348
349    async fn extract_project_metadata(&self) -> Result<ProjectMetadata> {
350        let cargo_toml_path = self.config.repo_path.join("Cargo.toml");
351        let readme_path = self.config.repo_path.join("README.md");
352
353        let mut metadata = ProjectMetadata {
354            name: self
355                .config
356                .repo_path
357                .file_name()
358                .unwrap_or_default()
359                .to_string_lossy()
360                .to_string(),
361            description: None,
362            dependencies: Vec::new(),
363            rust_version: None,
364        };
365
366        // Extract information from Cargo.toml
367        if cargo_toml_path.exists() {
368            let cargo_content = fs::read_to_string(&cargo_toml_path)?;
369            self.parse_cargo_toml(&cargo_content, &mut metadata)?;
370        }
371
372        // Extract description from README.md
373        if readme_path.exists() {
374            let readme_content = fs::read_to_string(&readme_path)?;
375            metadata.description = self.extract_description_from_readme(&readme_content);
376        }
377
378        Ok(metadata)
379    }
380
381    fn parse_cargo_toml(&self, content: &str, metadata: &mut ProjectMetadata) -> Result<()> {
382        let lines: Vec<&str> = content.lines().collect();
383        let mut in_package = false;
384        let mut in_dependencies = false;
385
386        for line in lines {
387            let line = line.trim();
388
389            if line.starts_with("[package]") {
390                in_package = true;
391                in_dependencies = false;
392                continue;
393            }
394
395            if line.starts_with("[dependencies") {
396                in_package = false;
397                in_dependencies = true;
398                continue;
399            }
400
401            if line.starts_with("[") {
402                in_package = false;
403                in_dependencies = false;
404                continue;
405            }
406
407            if in_package {
408                if line.starts_with("name") {
409                    if let Some(name) = line.split('=').nth(1) {
410                        metadata.name = name.trim().trim_matches('"').to_string();
411                    }
412                } else if line.starts_with("version") {
413                    if let Some(version) = line.split('=').nth(1) {
414                        metadata.rust_version = Some(version.trim().trim_matches('"').to_string());
415                    }
416                }
417            }
418
419            if in_dependencies && !line.is_empty() {
420                if let Some(dep_name) = line.split('=').next() {
421                    metadata.dependencies.push(dep_name.trim().to_string());
422                }
423            }
424        }
425
426        Ok(())
427    }
428
429    fn extract_description_from_readme(&self, content: &str) -> Option<String> {
430        let lines: Vec<&str> = content.lines().collect();
431        let mut description = String::new();
432
433        for line in lines.iter().take(10) {
434            if line.starts_with('#') {
435                continue;
436            }
437
438            if !line.trim().is_empty() {
439                description.push_str(line);
440                description.push('\n');
441
442                if description.len() > 200 {
443                    break;
444                }
445            }
446        }
447
448        if description.trim().is_empty() {
449            None
450        } else {
451            Some(description.trim().to_string())
452        }
453    }
454}