tree_parser/
parser.rs

1//! Core parsing functionality
2
3use crate::{
4    languages::*, CodeConstruct, ConstructMetadata, Error, ErrorType, FileError, Language,
5    LanguageDetection, ParseOptions, ParsedFile, ParsedProject,
6};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9
10use tokio::fs;
11use tree_sitter::{Node, Parser, Tree};
12use walkdir::WalkDir;
13
14/// Parse a single source code file and extract code constructs
15/// 
16/// This function reads a source code file, parses it using tree-sitter,
17/// and extracts all identifiable code constructs (functions, classes, etc.).
18/// 
19/// # Arguments
20/// 
21/// * `file_path` - Path to the source code file to parse
22/// * `language` - The programming language of the file
23/// 
24/// # Returns
25/// 
26/// Returns a `ParsedFile` containing all extracted constructs and metadata,
27/// or an `Error` if parsing fails.
28/// 
29/// # Examples
30/// 
31/// ```rust
32/// use tree_parser::{parse_file, Language};
33/// 
34/// #[tokio::main]
35/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
36///     let result = parse_file("src/main.rs", Language::Rust).await?;
37///     println!("Found {} constructs", result.constructs.len());
38///     Ok(())
39/// }
40/// ```
41/// 
42/// # Errors
43/// 
44/// This function will return an error if:
45/// - The file cannot be read (I/O error)
46/// - The file content cannot be parsed (syntax error)
47/// - The specified language is not supported
48pub async fn parse_file(file_path: &str, language: Language) -> Result<ParsedFile, Error> {
49    // Read file content
50    let content = fs::read_to_string(file_path)
51        .await
52        .map_err(|e| Error::Io(e.to_string()))?;
53    
54    let file_size_bytes = content.len();
55    
56    // Get tree-sitter language
57    let ts_language = get_tree_sitter_language(&language)?;
58    
59    // Create parser
60    let mut parser = Parser::new();
61    parser
62        .set_language(&ts_language)
63        .map_err(|e| Error::Parse(e.to_string()))?;
64    
65    // Parse the content
66    let tree = parser
67        .parse(&content, None)
68        .ok_or_else(|| Error::Parse("Failed to parse file".to_string()))?;
69    
70    // Extract code constructs
71    let constructs = extract_constructs(&tree, &content, &language);
72    
73    let path = Path::new(file_path);
74    let relative_path = path
75        .file_name()
76        .unwrap_or_default()
77        .to_string_lossy()
78        .to_string();
79    
80    Ok(ParsedFile {
81        file_path: file_path.to_string(),
82        relative_path,
83        language,
84        constructs,
85        syntax_tree: Some(tree),
86        file_size_bytes,
87
88    })
89}
90
91/// Parse an entire project directory recursively
92/// 
93/// This function traverses a directory structure, identifies source code files,
94/// and parses them concurrently to extract code constructs from all supported files.
95/// 
96/// # Arguments
97/// 
98/// * `dir_path` - Path to the root directory to parse
99/// * `options` - Configuration options controlling parsing behavior
100/// 
101/// # Returns
102/// 
103/// Returns a `ParsedProject` containing results from all parsed files,
104/// including statistics and error information.
105/// 
106/// # Examples
107/// 
108/// ```rust
109/// use tree_parser::{parse_directory, ParseOptions};
110/// 
111/// #[tokio::main]
112/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
113///     let options = ParseOptions::default();
114///     let project = parse_directory("./src", options).await?;
115///     
116///     println!("Parsed {} files", project.total_files_processed);
117///     for (language, count) in &project.language_distribution {
118///         println!("{:?}: {} files", language, count);
119///     }
120///     Ok(())
121/// }
122/// ```
123/// 
124/// # Performance
125/// 
126/// This function uses concurrent processing to parse multiple files simultaneously.
127/// The concurrency level is controlled by `options.max_concurrent_files`.
128pub async fn parse_directory(
129    dir_path: &str,
130    options: ParseOptions,
131) -> Result<ParsedProject, Error> {
132    let root_path = PathBuf::from(dir_path);
133    
134    if !root_path.exists() {
135        return Err(Error::Io(format!("Directory does not exist: {}", dir_path)));
136    }
137    
138    // Collect files to parse
139    let files_to_parse = collect_files(&root_path, &options)?;
140    
141    // Parse files in parallel
142    let (parsed_files, error_files) = parse_files_parallel(files_to_parse, &options).await;
143    
144    // Calculate statistics
145    let total_files_processed = parsed_files.len();
146    let mut language_distribution = HashMap::new();
147    for file in &parsed_files {
148        *language_distribution.entry(file.language.clone()).or_insert(0) += 1;
149    }
150    
151    Ok(ParsedProject {
152        root_path: dir_path.to_string(),
153        files: parsed_files,
154        total_files_processed,
155
156        language_distribution,
157        error_files,
158    })
159}
160
161/// Parse a project directory with custom file filtering
162/// 
163/// This function provides advanced filtering capabilities for selecting which files
164/// to parse within a directory structure. It combines the standard parsing options
165/// with custom filtering criteria.
166/// 
167/// # Arguments
168/// 
169/// * `dir_path` - Path to the root directory to parse
170/// * `file_filter` - Custom filter criteria for file selection
171/// * `options` - Configuration options controlling parsing behavior
172/// 
173/// # Returns
174/// 
175/// Returns a `ParsedProject` containing results from all files that match
176/// the filter criteria.
177/// 
178/// # Examples
179/// 
180/// ```rust
181/// use tree_parser::{parse_directory_with_filter, ParseOptions, FileFilter, Language};
182/// use std::sync::Arc;
183/// 
184/// #[tokio::main]
185/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
186///     let filter = FileFilter {
187///         languages: Some(vec![Language::Rust, Language::Python]),
188///         extensions: None,
189///         min_size_bytes: Some(100),
190///         max_size_bytes: Some(100_000),
191///         custom_predicate: Some(Arc::new(|path| {
192///             !path.to_string_lossy().contains("test")
193///         })),
194///     };
195///     
196///     let options = ParseOptions::default();
197///     let project = parse_directory_with_filter("./src", &filter, options).await?;
198///     
199///     println!("Parsed {} filtered files", project.total_files_processed);
200///     Ok(())
201/// }
202/// ```
203pub async fn parse_directory_with_filter(
204    dir_path: &str,
205    file_filter: &crate::FileFilter,
206    options: ParseOptions,
207) -> Result<ParsedProject, Error> {
208    let root_path = PathBuf::from(dir_path);
209    
210    if !root_path.exists() {
211        return Err(Error::Io(format!("Directory does not exist: {}", dir_path)));
212    }
213    
214    // Collect files to parse with custom filter
215    let files_to_parse = collect_files_with_filter(&root_path, &options, file_filter)?;
216    
217    // Parse files in parallel
218    let (parsed_files, error_files) = parse_files_parallel(files_to_parse, &options).await;
219    
220    // Calculate statistics
221    let total_files_processed = parsed_files.len();
222    let mut language_distribution = HashMap::new();
223    for file in &parsed_files {
224        *language_distribution.entry(file.language.clone()).or_insert(0) += 1;
225    }
226    
227    Ok(ParsedProject {
228        root_path: dir_path.to_string(),
229        files: parsed_files,
230        total_files_processed,
231
232        language_distribution,
233        error_files,
234    })
235}
236
237/// Collect files to parse from directory based on parsing options
238/// 
239/// This internal function traverses a directory structure and collects all files
240/// that should be parsed according to the provided options.
241/// 
242/// # Arguments
243/// 
244/// * `root_path` - Root directory path to start collection from
245/// * `options` - Parsing options that control file selection
246/// 
247/// # Returns
248/// 
249/// A vector of file paths that should be parsed, or an error if directory
250/// traversal fails.
251fn collect_files(root_path: &Path, options: &ParseOptions) -> Result<Vec<PathBuf>, Error> {
252    let mut files = Vec::new();
253    
254    let walker = if options.recursive {
255        WalkDir::new(root_path)
256    } else {
257        WalkDir::new(root_path).max_depth(1)
258    };
259    
260    for entry in walker {
261        let entry = entry.map_err(|e| Error::Io(e.to_string()))?;
262        let path = entry.path();
263        
264        // Skip directories
265        if path.is_dir() {
266            continue;
267        }
268        
269        // Skip hidden files if not included
270        if !options.include_hidden_files && is_hidden_file(path) {
271            continue;
272        }
273        
274        // Check ignore patterns
275        if should_ignore_file(path, &options.ignore_patterns) {
276            continue;
277        }
278        
279        // Check file size
280        if let Ok(metadata) = path.metadata() {
281            let size_mb = metadata.len() as usize / (1024 * 1024);
282            if size_mb > options.max_file_size_mb {
283                continue;
284            }
285        }
286        
287        // Check if we can detect the language
288        if detect_language_by_extension(&path.to_string_lossy()).is_some() {
289            files.push(path.to_path_buf());
290        }
291    }
292    
293    Ok(files)
294}
295
296/// Collect files with custom filter criteria
297/// 
298/// This internal function extends the basic file collection with additional
299/// filtering capabilities provided by a `FileFilter`.
300/// 
301/// # Arguments
302/// 
303/// * `root_path` - Root directory path to start collection from
304/// * `options` - Parsing options that control file selection
305/// * `filter` - Custom filter criteria for more precise file selection
306/// 
307/// # Returns
308/// 
309/// A vector of file paths that match both the parsing options and the custom
310/// filter criteria.
311fn collect_files_with_filter(
312    root_path: &Path,
313    options: &ParseOptions,
314    filter: &crate::FileFilter,
315) -> Result<Vec<PathBuf>, Error> {
316    let mut files = collect_files(root_path, options)?;
317    
318    // Apply custom filter
319    files.retain(|path| {
320        // Check extensions
321        if let Some(ref extensions) = filter.extensions {
322            if let Some(ext) = path.extension() {
323                if !extensions.contains(&ext.to_string_lossy().to_lowercase()) {
324                    return false;
325                }
326            } else {
327                return false;
328            }
329        }
330        
331        // Check languages
332        if let Some(ref languages) = filter.languages {
333            if let Some(detected_lang) = detect_language_by_extension(&path.to_string_lossy()) {
334                if !languages.contains(&detected_lang) {
335                    return false;
336                }
337            } else {
338                return false;
339            }
340        }
341        
342        // Check file size
343        if let Ok(metadata) = path.metadata() {
344            let size = metadata.len() as usize;
345            
346            if let Some(min_size) = filter.min_size_bytes {
347                if size < min_size {
348                    return false;
349                }
350            }
351            
352            if let Some(max_size) = filter.max_size_bytes {
353                if size > max_size {
354                    return false;
355                }
356            }
357        }
358        
359        // Apply custom predicate
360        if let Some(ref predicate) = filter.custom_predicate {
361            if !predicate(path) {
362                return false;
363            }
364        }
365        
366        true
367    });
368    
369    Ok(files)
370}
371
372/// Parse files in parallel
373async fn parse_files_parallel(
374    files: Vec<PathBuf>,
375    options: &ParseOptions,
376) -> (Vec<ParsedFile>, Vec<FileError>) {
377    let chunk_size = std::cmp::max(1, files.len() / options.max_concurrent_files);
378    let mut parsed_files = Vec::new();
379    let mut error_files = Vec::new();
380    
381    for chunk in files.chunks(chunk_size) {
382        let chunk_results: Vec<_> = chunk
383            .iter()
384            .map(|path| async move {
385                let path_str = path.to_string_lossy().to_string();
386                
387                // Detect language
388                let language = match options.language_detection {
389                    LanguageDetection::ByExtension => detect_language_by_extension(&path_str),
390                    LanguageDetection::Combined => {
391                        // Try to read content for better detection
392                        if let Ok(content) = tokio::fs::read_to_string(path).await {
393                            detect_language(&path_str, Some(&content))
394                        } else {
395                            detect_language_by_extension(&path_str)
396                        }
397                    }
398                    _ => detect_language_by_extension(&path_str), // Fallback
399                };
400                
401                if let Some(lang) = language {
402                    match parse_file(&path_str, lang).await {
403                        Ok(parsed) => Ok(parsed),
404                        Err(e) => Err(FileError {
405                            file_path: path_str,
406                            error_type: ErrorType::ParseError,
407                            message: e.to_string(),
408                        }),
409                    }
410                } else {
411                    Err(FileError {
412                        file_path: path_str,
413                        error_type: ErrorType::UnsupportedLanguage,
414                        message: "Could not detect language".to_string(),
415                    })
416                }
417            })
418            .collect();
419        
420        // Await all tasks in this chunk
421        for result in futures::future::join_all(chunk_results).await {
422            match result {
423                Ok(parsed_file) => parsed_files.push(parsed_file),
424                Err(error) => error_files.push(error),
425            }
426        }
427    }
428    
429    (parsed_files, error_files)
430}
431
432/// Extract code constructs from syntax tree
433fn extract_constructs(tree: &Tree, source: &str, language: &Language) -> Vec<CodeConstruct> {
434    let mut constructs = Vec::new();
435    let root_node = tree.root_node();
436    
437    extract_constructs_recursive(root_node, source, language, &mut constructs, None);
438    
439    constructs
440}
441
442/// Recursively extract constructs from nodes
443fn extract_constructs_recursive(
444    node: Node,
445    source: &str,
446    language: &Language,
447    constructs: &mut Vec<CodeConstruct>,
448    parent: Option<&CodeConstruct>,
449) {
450    let node_type = node.kind();
451    let supported_types = get_supported_node_types(language);
452    
453    if supported_types.contains(&node_type.to_string()) {
454        let construct = create_code_construct(node, source, language);
455        constructs.push(construct);
456    }
457    
458    // Recursively process children
459    for i in 0..node.child_count() {
460        if let Some(child) = node.child(i) {
461            extract_constructs_recursive(child, source, language, constructs, parent);
462        }
463    }
464}
465
466/// Create a CodeConstruct from a tree-sitter node
467fn create_code_construct(node: Node, source: &str, language: &Language) -> CodeConstruct {
468    let start_byte = node.start_byte();
469    let end_byte = node.end_byte();
470    let source_code = source[start_byte..end_byte].to_string();
471    
472    let start_point = node.start_position();
473    let end_point = node.end_position();
474    
475    // Extract name if possible
476    let name = extract_construct_name(node, source);
477    
478    // Create metadata
479    let metadata = extract_metadata(node, source, language);
480    
481    CodeConstruct {
482        node_type: node.kind().to_string(),
483        name,
484        source_code,
485        start_line: start_point.row + 1, // Convert to 1-based
486        end_line: end_point.row + 1,
487        start_byte,
488        end_byte,
489        parent: None, // Will be set later if needed
490        children: Vec::new(), // Will be populated later if needed
491        metadata,
492    }
493}
494
495/// Extract construct name from node
496fn extract_construct_name(node: Node, source: &str) -> Option<String> {
497    // Try to find identifier child
498    for i in 0..node.child_count() {
499        if let Some(child) = node.child(i) {
500            if child.kind() == "identifier" || child.kind() == "name" {
501                let start = child.start_byte();
502                let end = child.end_byte();
503                return Some(source[start..end].to_string());
504            }
505        }
506    }
507    None
508}
509
510/// Extract metadata from node
511fn extract_metadata(_node: Node, _source: &str, _language: &Language) -> ConstructMetadata {
512    ConstructMetadata {
513        visibility: None,
514        modifiers: Vec::new(),
515        parameters: Vec::new(),
516        return_type: None,
517        inheritance: Vec::new(),
518        annotations: Vec::new(),
519        documentation: None,
520    }
521}
522
/// Report whether the final component of `path` starts with a dot.
///
/// Only the file name is examined, so a regular file inside a dot-directory
/// is not considered hidden. Paths without a file name (such as `..`) and
/// names that are not valid UTF-8 yield `false`.
fn is_hidden_file(path: &Path) -> bool {
    match path.file_name().and_then(|os_name| os_name.to_str()) {
        Some(file_name) => file_name.starts_with('.'),
        None => false,
    }
}
530
/// Report whether `path` matches any of the ignore patterns.
///
/// A pattern matches when it occurs as a plain substring anywhere in the
/// (lossily converted) path text; no glob or regex syntax is interpreted.
/// An empty pattern list never matches.
fn should_ignore_file(path: &Path, ignore_patterns: &[String]) -> bool {
    let path_text = path.to_string_lossy();
    ignore_patterns
        .iter()
        .any(|pattern| path_text.contains(pattern.as_str()))
}