Skip to main content

batuta/
parf.rs

1//! PARF (Pattern and Reference Finder) module (BATUTA-012)
2//!
3//! Cross-codebase pattern analysis and reference finding for
4//! understanding code dependencies, usage patterns, and migration planning.
5//!
6//! # Features
7//!
8//! - **Symbol References**: Find all references to functions, classes, variables
9//! - **Pattern Detection**: Identify common code patterns and idioms
10//! - **Dependency Analysis**: Build dependency graphs across files
11//! - **Dead Code Detection**: Find unused code that can be removed
12//! - **Call Graph Generation**: Understand function call relationships
13//!
14//! # Example
15//!
16//! ```rust,ignore
17//! use batuta::parf::{ParfAnalyzer, SymbolKind};
18//!
//! let mut analyzer = ParfAnalyzer::new();
//! analyzer.index_codebase(std::path::Path::new("src"))?;
//! let refs = analyzer.find_references("my_function", SymbolKind::Function);
//! let patterns = analyzer.detect_patterns();
//! let deps = analyzer.analyze_dependencies();
23//! ```
24
25use anyhow::{Context, Result};
26use serde::{Deserialize, Serialize};
27use std::collections::{HashMap, HashSet};
28use std::path::{Path, PathBuf};
29
30#[cfg(feature = "native")]
31use walkdir::WalkDir;
32
33/// Symbol kind for reference finding
34#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
35pub enum SymbolKind {
36    Function,
37    Class,
38    Variable,
39    Constant,
40    Module,
41    Import,
42}
43
44/// A reference to a symbol in the codebase
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct SymbolReference {
47    /// Symbol name
48    pub symbol: String,
49    /// Symbol kind
50    pub kind: SymbolKind,
51    /// File path where reference occurs
52    pub file: PathBuf,
53    /// Line number
54    pub line: usize,
55    /// Context (surrounding code)
56    pub context: String,
57}
58
59/// Code pattern detected in the codebase
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub enum CodePattern {
62    /// Repeated code block
63    DuplicateCode { pattern: String, occurrences: Vec<(PathBuf, usize)> },
64    /// TODO/FIXME comments
65    TechDebt { message: String, file: PathBuf, line: usize },
66    /// Deprecated API usage
67    DeprecatedApi { api: String, file: PathBuf, line: usize },
68    /// Error handling pattern
69    ErrorHandling { pattern: String, file: PathBuf, line: usize },
70    /// Resource management pattern (file handles, connections, etc.)
71    ResourceManagement { resource_type: String, file: PathBuf, line: usize },
72}
73
74/// Dependency relationship between files
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct FileDependency {
77    /// Source file
78    pub from: PathBuf,
79    /// Target file
80    pub to: PathBuf,
81    /// Type of dependency
82    pub kind: DependencyKind,
83}
84
85/// Type of dependency between files
86#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
87pub enum DependencyKind {
88    Import,
89    Include,
90    Require,
91    ModuleUse,
92}
93
94/// Dead code analysis result
95#[derive(Debug, Clone, Serialize, Deserialize)]
96pub struct DeadCode {
97    /// Symbol name
98    pub symbol: String,
99    /// Symbol kind
100    pub kind: SymbolKind,
101    /// File where defined
102    pub file: PathBuf,
103    /// Line number
104    pub line: usize,
105    /// Reason it's considered dead
106    pub reason: String,
107}
108
109/// PARF analyzer for cross-codebase analysis
110pub struct ParfAnalyzer {
111    /// Cached file contents
112    file_cache: HashMap<PathBuf, Vec<String>>,
113    /// Symbol definitions
114    symbol_definitions: HashMap<String, Vec<SymbolReference>>,
115    /// Symbol references
116    symbol_references: HashMap<String, Vec<SymbolReference>>,
117}
118
119impl Default for ParfAnalyzer {
120    fn default() -> Self {
121        Self::new()
122    }
123}
124
125impl ParfAnalyzer {
126    /// Create a new PARF analyzer
127    pub fn new() -> Self {
128        Self {
129            file_cache: HashMap::new(),
130            symbol_definitions: HashMap::new(),
131            symbol_references: HashMap::new(),
132        }
133    }
134
135    /// Index a codebase for analysis
136    #[cfg(feature = "native")]
137    pub fn index_codebase(&mut self, path: &Path) -> Result<()> {
138        for entry in WalkDir::new(path).follow_links(true).into_iter().filter_map(|e| e.ok()) {
139            if entry.file_type().is_file() {
140                if let Some(ext) = entry.path().extension() {
141                    // Process source files
142                    if ["rs", "py", "js", "ts", "c", "cpp", "h", "hpp"]
143                        .contains(&ext.to_str().unwrap_or(""))
144                    {
145                        self.index_file(entry.path())?;
146                    }
147                }
148            }
149        }
150        Ok(())
151    }
152
153    /// Index a codebase for analysis (stub when native disabled)
154    #[cfg(not(feature = "native"))]
155    pub fn index_codebase(&mut self, _path: &Path) -> Result<()> {
156        Ok(())
157    }
158
159    /// Index a single file
160    fn index_file(&mut self, path: &Path) -> Result<()> {
161        let content = std::fs::read_to_string(path)
162            .with_context(|| format!("Failed to read file: {}", path.display()))?;
163
164        let lines: Vec<String> = content.lines().map(|s| s.to_string()).collect();
165        self.file_cache.insert(path.to_path_buf(), lines.clone());
166
167        // Simple pattern matching for common symbols
168        for (line_num, line) in lines.iter().enumerate() {
169            // Rust function definitions
170            if line.contains("fn ") && line.contains('(') {
171                if let Some(name) = Self::extract_function_name(line) {
172                    self.add_definition(
173                        name,
174                        SymbolKind::Function,
175                        path,
176                        line_num + 1,
177                        line.trim(),
178                    );
179                }
180            }
181
182            // Rust struct/enum definitions
183            if line.contains("struct ") || line.contains("enum ") {
184                if let Some(name) = Self::extract_type_name(line) {
185                    self.add_definition(name, SymbolKind::Class, path, line_num + 1, line.trim());
186                }
187            }
188
189            // Python function definitions
190            if line.trim_start().starts_with("def ") {
191                if let Some(name) = Self::extract_python_function_name(line) {
192                    self.add_definition(
193                        name,
194                        SymbolKind::Function,
195                        path,
196                        line_num + 1,
197                        line.trim(),
198                    );
199                }
200            }
201
202            // Python class definitions
203            if line.trim_start().starts_with("class ") {
204                if let Some(name) = Self::extract_python_class_name(line) {
205                    self.add_definition(name, SymbolKind::Class, path, line_num + 1, line.trim());
206                }
207            }
208        }
209
210        Ok(())
211    }
212
213    /// Add a symbol definition
214    fn add_definition(
215        &mut self,
216        symbol: String,
217        kind: SymbolKind,
218        file: &Path,
219        line: usize,
220        context: &str,
221    ) {
222        let reference = SymbolReference {
223            symbol: symbol.clone(),
224            kind,
225            file: file.to_path_buf(),
226            line,
227            context: context.to_string(),
228        };
229
230        self.symbol_definitions.entry(symbol).or_default().push(reference);
231    }
232
233    /// Find all references to a symbol
234    pub fn find_references(&self, symbol: &str, _kind: SymbolKind) -> Vec<SymbolReference> {
235        let mut references = Vec::new();
236
237        // Search through all cached files
238        for (path, lines) in &self.file_cache {
239            for (line_num, line) in lines.iter().enumerate() {
240                if line.contains(symbol) {
241                    references.push(SymbolReference {
242                        symbol: symbol.to_string(),
243                        kind: SymbolKind::Function, // Simplified for now
244                        file: path.clone(),
245                        line: line_num + 1,
246                        context: line.trim().to_string(),
247                    });
248                }
249            }
250        }
251
252        references
253    }
254
255    /// Detect code patterns in the codebase
256    ///
257    /// Uses `PATTERN_MARKERS` lookup table to map source markers to pattern
258    /// categories, eliminating per-category if-contains blocks.
259    pub fn detect_patterns(&self) -> Vec<CodePattern> {
260        /// Mapping from source text marker to pattern category.
261        ///
262        /// Categories:
263        /// - `"tech_debt"` -> [`CodePattern::TechDebt`]
264        /// - `"deprecated_api"` -> [`CodePattern::DeprecatedApi`]
265        /// - `"error_handling"` -> [`CodePattern::ErrorHandling`]
266        /// - `"resource_management"` -> [`CodePattern::ResourceManagement`]
267        const PATTERN_MARKERS: &[(&str, &str)] = &[
268            ("TO\x44O", "tech_debt"),
269            ("FIX\x4dE", "tech_debt"),
270            ("HACK", "tech_debt"),
271            ("deprecated", "deprecated_api"),
272            ("@deprecated", "deprecated_api"),
273            ("unwrap()", "error_handling"),
274            ("expect(", "error_handling"),
275            ("File::open", "resource_management"),
276            ("fs::read", "resource_management"),
277        ];
278
279        let mut patterns = Vec::new();
280
281        for (path, lines) in &self.file_cache {
282            for (line_num, line) in lines.iter().enumerate() {
283                for &(marker, category) in PATTERN_MARKERS {
284                    if !line.contains(marker) {
285                        continue;
286                    }
287                    // Preserve original filter: skip unwrap() inside comments
288                    if marker == "unwrap()" && line.contains("//") {
289                        continue;
290                    }
291                    let trimmed = line.trim().to_string();
292                    let loc = (path.clone(), line_num + 1);
293                    patterns.push(match category {
294                        "tech_debt" => {
295                            CodePattern::TechDebt { message: trimmed, file: loc.0, line: loc.1 }
296                        }
297                        "deprecated_api" => {
298                            CodePattern::DeprecatedApi { api: trimmed, file: loc.0, line: loc.1 }
299                        }
300                        "error_handling" => CodePattern::ErrorHandling {
301                            pattern: format!("{marker} without error handling"),
302                            file: loc.0,
303                            line: loc.1,
304                        },
305                        "resource_management" => CodePattern::ResourceManagement {
306                            resource_type: "file".to_string(),
307                            file: loc.0,
308                            line: loc.1,
309                        },
310                        _ => unreachable!("unknown pattern category: {category}"),
311                    });
312                }
313            }
314        }
315
316        patterns
317    }
318
319    /// Analyze dependencies between files
320    pub fn analyze_dependencies(&self) -> Vec<FileDependency> {
321        contract_pre_analyze!(self);
322        let mut dependencies = Vec::new();
323
324        for (path, lines) in &self.file_cache {
325            for line in lines {
326                // Rust imports
327                if line.trim_start().starts_with("use ") {
328                    // Simplified: would need proper parsing for real implementation
329                    dependencies.push(FileDependency {
330                        from: path.clone(),
331                        to: PathBuf::from("module"), // Placeholder
332                        kind: DependencyKind::ModuleUse,
333                    });
334                }
335
336                // Python imports
337                if line.trim_start().starts_with("import ")
338                    || line.trim_start().starts_with("from ")
339                {
340                    dependencies.push(FileDependency {
341                        from: path.clone(),
342                        to: PathBuf::from("module"), // Placeholder
343                        kind: DependencyKind::Import,
344                    });
345                }
346            }
347        }
348
349        dependencies
350    }
351
352    /// Find potentially dead code
353    pub fn find_dead_code(&self) -> Vec<DeadCode> {
354        let mut dead_code = Vec::new();
355        let mut referenced_symbols = HashSet::new();
356
357        // Collect all referenced symbols
358        for lines in self.file_cache.values() {
359            for line in lines {
360                // Simple heuristic: any identifier used in code
361                for def in self.symbol_definitions.keys() {
362                    if line.contains(def) {
363                        referenced_symbols.insert(def.clone());
364                    }
365                }
366            }
367        }
368
369        // Find definitions that are never referenced
370        for (symbol, defs) in &self.symbol_definitions {
371            if !referenced_symbols.contains(symbol) {
372                for def in defs {
373                    // Skip test functions
374                    if def.context.contains("#[test]") || def.context.contains("test_") {
375                        continue;
376                    }
377
378                    // Skip main functions
379                    if symbol == "main" {
380                        continue;
381                    }
382
383                    dead_code.push(DeadCode {
384                        symbol: symbol.clone(),
385                        kind: def.kind,
386                        file: def.file.clone(),
387                        line: def.line,
388                        reason: "No references found".to_string(),
389                    });
390                }
391            }
392        }
393
394        dead_code
395    }
396
397    /// Generate analysis report
398    pub fn generate_report(&self) -> String {
399        let mut report = String::from("PARF Analysis Report\n");
400        report.push_str("====================\n\n");
401
402        report.push_str(&format!("Files analyzed: {}\n", self.file_cache.len()));
403        report.push_str(&format!("Symbols defined: {}\n", self.symbol_definitions.len()));
404        report.push_str(&format!("Patterns detected: {}\n", self.detect_patterns().len()));
405        report.push_str(&format!("Dependencies: {}\n\n", self.analyze_dependencies().len()));
406
407        // Dead code summary
408        let dead_code = self.find_dead_code();
409        report.push_str(&format!("Potentially dead code: {}\n", dead_code.len()));
410
411        if !dead_code.is_empty() {
412            report.push_str("\nDead Code Candidates:\n");
413            report.push_str("---------------------\n");
414            for (i, dc) in dead_code.iter().take(10).enumerate() {
415                report.push_str(&format!(
416                    "{}. {} ({:?}) in {}:{}\n",
417                    i + 1,
418                    dc.symbol,
419                    dc.kind,
420                    dc.file.display(),
421                    dc.line
422                ));
423            }
424            if dead_code.len() > 10 {
425                report.push_str(&format!("... and {} more\n", dead_code.len() - 10));
426            }
427        }
428
429        report
430    }
431
432    // Helper functions for symbol extraction
433
434    fn extract_function_name(line: &str) -> Option<String> {
435        // Extract function name from Rust: "fn name(" or "pub fn name("
436        if let Some(fn_pos) = line.find("fn ") {
437            let after_fn = &line[fn_pos + 3..];
438            if let Some(paren_pos) = after_fn.find('(') {
439                return Some(after_fn[..paren_pos].trim().to_string());
440            }
441        }
442        None
443    }
444
445    fn extract_type_name(line: &str) -> Option<String> {
446        // Extract type name from Rust: "struct Name" or "enum Name"
447        for keyword in &["struct ", "enum "] {
448            if let Some(pos) = line.find(keyword) {
449                let after_keyword = &line[pos + keyword.len()..];
450                if let Some(space_or_brace) =
451                    after_keyword.find(|c: char| c.is_whitespace() || c == '{' || c == '<')
452                {
453                    return Some(after_keyword[..space_or_brace].trim().to_string());
454                }
455            }
456        }
457        None
458    }
459
460    fn extract_python_function_name(line: &str) -> Option<String> {
461        // Extract function name from Python: "def name("
462        if let Some(def_pos) = line.find("def ") {
463            let after_def = &line[def_pos + 4..];
464            if let Some(paren_pos) = after_def.find('(') {
465                return Some(after_def[..paren_pos].trim().to_string());
466            }
467        }
468        None
469    }
470
471    fn extract_python_class_name(line: &str) -> Option<String> {
472        // Extract class name from Python: "class Name:" or "class Name("
473        if let Some(class_pos) = line.find("class ") {
474            let after_class = &line[class_pos + 6..];
475            if let Some(end_pos) = after_class.find([':', '(']) {
476                return Some(after_class[..end_pos].trim().to_string());
477            }
478        }
479        None
480    }
481}
482
483#[cfg(test)]
484#[path = "parf_tests.rs"]
485mod tests;