codegraph_python/
parser.rs

1use crate::config::ParserConfig;
2use std::collections::HashMap;
3use std::path::PathBuf;
4use std::time::Duration;
5use tracing::{debug, info, instrument, warn};
6
/// Summary of what was recorded for a single parsed source file.
#[derive(Debug, Clone)]
pub struct FileInfo {
    /// Location of the file this summary describes.
    pub file_path: PathBuf,

    /// Identifiers of the functions found in the file.
    pub functions: Vec<String>,

    /// Identifiers of the classes found in the file.
    pub classes: Vec<String>,

    /// Identifiers of the modules found in the file.
    pub modules: Vec<String>,

    /// Identifiers of the traits found in the file.
    pub traits: Vec<String>,

    /// Line count of the file.
    pub lines: usize,

    /// How long parsing the file took.
    pub parse_time: Duration,
}

impl FileInfo {
    /// Builds an empty summary for `file_path`: no entities, zero lines and a
    /// zero parse time.
    pub fn new(file_path: PathBuf) -> Self {
        FileInfo {
            file_path,
            functions: vec![],
            classes: vec![],
            modules: vec![],
            traits: vec![],
            lines: 0,
            parse_time: Duration::default(),
        }
    }

    /// Returns how many entities are recorded in total, i.e. the combined
    /// length of `functions`, `classes`, `modules` and `traits`.
    pub fn entity_count(&self) -> usize {
        let per_kind = [
            self.functions.len(),
            self.classes.len(),
            self.modules.len(),
            self.traits.len(),
        ];
        per_kind.iter().sum()
    }
}
51
52/// Information about a parsed project
53#[derive(Debug, Clone)]
54pub struct ProjectInfo {
55    /// All successfully parsed files
56    pub files: Vec<FileInfo>,
57
58    /// Failed files with error messages
59    pub failed_files: HashMap<PathBuf, String>,
60
61    /// Total number of functions across all files
62    pub total_functions: usize,
63
64    /// Total number of classes across all files
65    pub total_classes: usize,
66
67    /// Total number of traits across all files
68    pub total_traits: usize,
69
70    /// Total number of lines across all files
71    pub total_lines: usize,
72
73    /// Total time taken to parse entire project
74    pub total_time: Duration,
75}
76
77impl ProjectInfo {
78    /// Create a new ProjectInfo
79    pub fn new() -> Self {
80        Self {
81            files: Vec::new(),
82            failed_files: HashMap::new(),
83            total_functions: 0,
84            total_classes: 0,
85            total_traits: 0,
86            total_lines: 0,
87            total_time: Duration::from_secs(0),
88        }
89    }
90
91    /// Calculate success rate as percentage
92    pub fn success_rate(&self) -> f64 {
93        let total = self.files.len() + self.failed_files.len();
94        if total == 0 {
95            return 100.0;
96        }
97        (self.files.len() as f64 / total as f64) * 100.0
98    }
99
100    /// Calculate average parse time per file
101    pub fn avg_parse_time(&self) -> Duration {
102        if self.files.is_empty() {
103            return Duration::from_secs(0);
104        }
105        self.total_time / self.files.len() as u32
106    }
107
108    /// Add a successfully parsed file
109    pub fn add_file(&mut self, file_info: FileInfo) {
110        self.total_functions += file_info.functions.len();
111        self.total_classes += file_info.classes.len();
112        self.total_traits += file_info.traits.len();
113        self.total_lines += file_info.lines;
114        self.total_time += file_info.parse_time;
115        self.files.push(file_info);
116    }
117
118    /// Add a failed file
119    pub fn add_failure(&mut self, path: PathBuf, error: String) {
120        self.failed_files.insert(path, error);
121    }
122}
123
124impl Default for ProjectInfo {
125    fn default() -> Self {
126        Self::new()
127    }
128}
129
130/// Main parser for Python source code
131pub struct Parser {
132    config: ParserConfig,
133}
134
135impl Parser {
136    /// Create a new parser with default configuration
137    pub fn new() -> Self {
138        Self {
139            config: ParserConfig::default(),
140        }
141    }
142
143    /// Create a parser with custom configuration
144    pub fn with_config(config: ParserConfig) -> Self {
145        Self { config }
146    }
147
148    /// Get the parser configuration
149    pub fn config(&self) -> &ParserConfig {
150        &self.config
151    }
152
153    /// Parse Python source code from a string
154    ///
155    /// # Arguments
156    ///
157    /// * `source` - Python source code as a string
158    /// * `file_path` - Path to the source file (for error reporting)
159    /// * `graph` - Mutable reference to the code graph
160    ///
161    /// # Returns
162    ///
163    /// A `FileInfo` with information about the parsed entities
164    pub fn parse_source(
165        &self,
166        source: &str,
167        file_path: &std::path::Path,
168        graph: &mut codegraph::CodeGraph,
169    ) -> crate::error::Result<FileInfo> {
170        use std::time::Instant;
171
172        let start = Instant::now();
173
174        // Extract entities from source code
175        let ir = crate::extractor::extract(source, file_path, &self.config).map_err(|e| {
176            crate::error::ParseError::SyntaxError {
177                file: file_path.display().to_string(),
178                line: 0,
179                column: 0,
180                message: e,
181            }
182        })?;
183
184        // Build graph from IR
185        let file_id = crate::builder::build_graph(graph, &ir, file_path.to_str().unwrap_or(""))?;
186
187        // Create FileInfo from IR
188        let mut file_info = FileInfo::new(file_path.to_path_buf());
189
190        // Convert NodeId to String for tracking (include top-level functions and methods)
191        file_info.functions = ir.functions.iter().map(|f| f.name.clone()).collect();
192
193        // Also add methods from classes
194        for class in &ir.classes {
195            for method in &class.methods {
196                file_info
197                    .functions
198                    .push(format!("{}.{}", class.name, method.name));
199            }
200        }
201
202        file_info.classes = ir.classes.iter().map(|c| c.name.clone()).collect();
203        file_info.traits = ir.traits.iter().map(|t| t.name.clone()).collect();
204
205        if let Some(ref module) = ir.module {
206            file_info.modules.push(module.name.clone());
207            file_info.lines = module.line_count;
208        }
209
210        file_info.parse_time = start.elapsed();
211
212        // Store the file_id for later use (could be added to FileInfo if needed)
213        let _ = file_id;
214
215        Ok(file_info)
216    }
217
218    /// Parse a Python file
219    ///
220    /// # Arguments
221    ///
222    /// * `file_path` - Path to the Python file
223    /// * `graph` - Mutable reference to the code graph
224    ///
225    /// # Returns
226    ///
227    /// A `FileInfo` with information about the parsed entities
228    #[instrument(skip(self, graph), fields(file = %file_path.display()))]
229    pub fn parse_file(
230        &self,
231        file_path: &std::path::Path,
232        graph: &mut codegraph::CodeGraph,
233    ) -> crate::error::Result<FileInfo> {
234        use std::fs;
235
236        debug!("Starting file parse");
237
238        // Validate file extension
239        if let Some(ext) = file_path.extension() {
240            if let Some(ext_str) = ext.to_str() {
241                if !self.config.should_parse_extension(ext_str) {
242                    warn!("Invalid file extension: {}", ext_str);
243                    return Err(crate::error::ParseError::InvalidConfig(format!(
244                        "File extension not allowed: {file_path:?}"
245                    )));
246                }
247            }
248        }
249
250        // Check file size
251        let metadata = fs::metadata(file_path).map_err(|e| crate::error::ParseError::IoError {
252            path: file_path.to_path_buf(),
253            source: e,
254        })?;
255
256        if metadata.len() > self.config.max_file_size as u64 {
257            warn!("File too large: {} bytes", metadata.len());
258            return Err(crate::error::ParseError::FileTooLarge {
259                path: file_path.to_path_buf(),
260                max_size: self.config.max_file_size,
261                actual_size: metadata.len() as usize,
262            });
263        }
264
265        // Read file contents
266        let source =
267            fs::read_to_string(file_path).map_err(|e| crate::error::ParseError::IoError {
268                path: file_path.to_path_buf(),
269                source: e,
270            })?;
271
272        // Parse the source
273        let result = self.parse_source(&source, file_path, graph)?;
274
275        info!(
276            functions = result.functions.len(),
277            classes = result.classes.len(),
278            lines = result.lines,
279            time_ms = result.parse_time.as_millis(),
280            "File parsed successfully"
281        );
282
283        Ok(result)
284    }
285
286    /// Parse all Python files in a directory recursively
287    ///
288    /// # Arguments
289    ///
290    /// * `dir_path` - Path to the directory to parse
291    /// * `graph` - Mutable reference to the code graph
292    ///
293    /// # Returns
294    ///
295    /// A `ProjectInfo` with information about all parsed files
296    #[instrument(skip(self, graph), fields(dir = %dir_path.display()))]
297    pub fn parse_directory(
298        &self,
299        dir_path: &std::path::Path,
300        graph: &mut codegraph::CodeGraph,
301    ) -> crate::error::Result<ProjectInfo> {
302        use std::time::Instant;
303        use walkdir::WalkDir;
304
305        let start = Instant::now();
306        let mut project_info = ProjectInfo::new();
307
308        info!("Starting directory parse");
309
310        // Collect all Python files in the directory
311        let mut files_to_parse = Vec::new();
312
313        for entry in WalkDir::new(dir_path)
314            .follow_links(false)
315            .into_iter()
316            .filter_entry(|e| {
317                // Skip excluded directories
318                if e.file_type().is_dir() {
319                    if let Some(name) = e.file_name().to_str() {
320                        return !self.config.should_exclude_dir(name);
321                    }
322                }
323                true
324            })
325        {
326            match entry {
327                Ok(entry) => {
328                    if entry.file_type().is_file() {
329                        if let Some(ext) = entry.path().extension() {
330                            if let Some(ext_str) = ext.to_str() {
331                                if self.config.should_parse_extension(ext_str) {
332                                    files_to_parse.push(entry.path().to_path_buf());
333                                }
334                            }
335                        }
336                    }
337                }
338                Err(e) => {
339                    // Record walkdir errors as failed files
340                    if let Some(path) = e.path() {
341                        project_info.add_failure(path.to_path_buf(), e.to_string());
342                    }
343                }
344            }
345        }
346
347        // Parse files (sequential or parallel based on config)
348        if self.config.parallel {
349            self.parse_files_parallel(&files_to_parse, graph, &mut project_info)?;
350        } else {
351            self.parse_files_sequential(&files_to_parse, graph, &mut project_info);
352        }
353
354        project_info.total_time = start.elapsed();
355
356        info!(
357            files_parsed = project_info.files.len(),
358            files_failed = project_info.failed_files.len(),
359            total_functions = project_info.total_functions,
360            total_classes = project_info.total_classes,
361            total_lines = project_info.total_lines,
362            total_time_ms = project_info.total_time.as_millis(),
363            success_rate = project_info.success_rate(),
364            "Directory parse completed"
365        );
366
367        Ok(project_info)
368    }
369
370    /// Parse files sequentially
371    fn parse_files_sequential(
372        &self,
373        files: &[PathBuf],
374        graph: &mut codegraph::CodeGraph,
375        project_info: &mut ProjectInfo,
376    ) {
377        for file_path in files {
378            match self.parse_file(file_path, graph) {
379                Ok(file_info) => {
380                    project_info.add_file(file_info);
381                }
382                Err(e) => {
383                    project_info.add_failure(file_path.clone(), e.to_string());
384                }
385            }
386        }
387    }
388
389    /// Parse files in parallel
390    fn parse_files_parallel(
391        &self,
392        files: &[PathBuf],
393        graph: &mut codegraph::CodeGraph,
394        project_info: &mut ProjectInfo,
395    ) -> crate::error::Result<()> {
396        use rayon::prelude::*;
397        use std::sync::Mutex;
398
399        let graph_mutex = Mutex::new(graph);
400        let project_info_mutex = Mutex::new(project_info);
401
402        // Configure thread pool if num_threads is specified
403        let pool = if let Some(num_threads) = self.config.num_threads {
404            rayon::ThreadPoolBuilder::new()
405                .num_threads(num_threads)
406                .build()
407                .map_err(|e| {
408                    crate::error::ParseError::InvalidConfig(format!(
409                        "Failed to create thread pool: {e}"
410                    ))
411                })?
412        } else {
413            rayon::ThreadPoolBuilder::new().build().map_err(|e| {
414                crate::error::ParseError::InvalidConfig(format!(
415                    "Failed to create thread pool: {e}"
416                ))
417            })?
418        };
419
420        pool.install(|| {
421            files.par_iter().for_each(|file_path| {
422                // Parse file with a temporary graph, then merge
423                // Note: This is simplified - in production we'd want better synchronization
424                let parse_result = {
425                    let mut graph = graph_mutex.lock().unwrap();
426                    self.parse_file(file_path, &mut graph)
427                };
428
429                let mut project_info = project_info_mutex.lock().unwrap();
430                match parse_result {
431                    Ok(file_info) => {
432                        project_info.add_file(file_info);
433                    }
434                    Err(e) => {
435                        project_info.add_failure(file_path.clone(), e.to_string());
436                    }
437                }
438            });
439        });
440
441        Ok(())
442    }
443}
444
445impl Default for Parser {
446    fn default() -> Self {
447        Self::new()
448    }
449}
450
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance for comparing computed percentages; exact `f64` equality
    /// against a decimal literal depends on rounding of the last digit.
    const EPSILON: f64 = 1e-9;

    /// A fresh `FileInfo` keeps its path and records no entities.
    #[test]
    fn test_file_info_new() {
        let info = FileInfo::new(PathBuf::from("test.py"));
        assert_eq!(info.file_path, PathBuf::from("test.py"));
        assert_eq!(info.entity_count(), 0);
    }

    /// The success rate is 100% when empty and files / (files + failures) otherwise.
    #[test]
    fn test_project_info_success_rate() {
        let mut info = ProjectInfo::new();
        assert!((info.success_rate() - 100.0).abs() < EPSILON);

        info.add_file(FileInfo::new(PathBuf::from("file1.py")));
        info.add_file(FileInfo::new(PathBuf::from("file2.py")));
        info.add_failure(PathBuf::from("file3.py"), "error".to_string());

        // Two successes out of three attempts.
        let expected: f64 = 200.0 / 3.0;
        let actual = info.success_rate();
        assert!(
            (actual - expected).abs() < EPSILON,
            "expected about {expected}, got {actual}"
        );
    }

    /// The average parse time is zero when empty and total / file count otherwise.
    #[test]
    fn test_project_info_avg_parse_time() {
        let mut info = ProjectInfo::new();
        assert_eq!(info.avg_parse_time(), Duration::from_secs(0));

        let mut first = FileInfo::new(PathBuf::from("file1.py"));
        first.parse_time = Duration::from_millis(10);
        let mut second = FileInfo::new(PathBuf::from("file2.py"));
        second.parse_time = Duration::from_millis(20);
        info.add_file(first);
        info.add_file(second);

        assert_eq!(info.avg_parse_time(), Duration::from_millis(15));
    }

    /// The default configuration includes private entities.
    #[test]
    fn test_parser_new() {
        let parser = Parser::new();
        assert!(parser.config().include_private);
    }
}