tree_parser/
parser.rs

1//! Core parsing functionality
2
3use crate::{
4    languages::*, CodeConstruct, ConstructMetadata, Error, ErrorType, FileError, Language,
5    LanguageDetection, ParseOptions, ParsedFile, ParsedProject,
6};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::time::Instant;
10use tokio::fs;
11use tree_sitter::{Node, Parser, Tree};
12use walkdir::WalkDir;
13
14/// Parse a single source code file and extract code constructs
15/// 
16/// This function reads a source code file, parses it using tree-sitter,
17/// and extracts all identifiable code constructs (functions, classes, etc.).
18/// 
19/// # Arguments
20/// 
21/// * `file_path` - Path to the source code file to parse
22/// * `language` - The programming language of the file
23/// 
24/// # Returns
25/// 
26/// Returns a `ParsedFile` containing all extracted constructs and metadata,
27/// or an `Error` if parsing fails.
28/// 
29/// # Examples
30/// 
31/// ```rust
32/// use tree_parser::{parse_file, Language};
33/// 
34/// #[tokio::main]
35/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
36///     let result = parse_file("src/main.rs", Language::Rust).await?;
37///     println!("Found {} constructs", result.constructs.len());
38///     Ok(())
39/// }
40/// ```
41/// 
42/// # Errors
43/// 
44/// This function will return an error if:
45/// - The file cannot be read (I/O error)
46/// - The file content cannot be parsed (syntax error)
47/// - The specified language is not supported
48pub async fn parse_file(file_path: &str, language: Language) -> Result<ParsedFile, Error> {
49    let start_time = Instant::now();
50    
51    // Read file content
52    let content = fs::read_to_string(file_path)
53        .await
54        .map_err(|e| Error::Io(e.to_string()))?;
55    
56    let file_size_bytes = content.len();
57    
58    // Get tree-sitter language
59    let ts_language = get_tree_sitter_language(&language)?;
60    
61    // Create parser
62    let mut parser = Parser::new();
63    parser
64        .set_language(&ts_language)
65        .map_err(|e| Error::Parse(e.to_string()))?;
66    
67    // Parse the content
68    let tree = parser
69        .parse(&content, None)
70        .ok_or_else(|| Error::Parse("Failed to parse file".to_string()))?;
71    
72    // Extract code constructs
73    let constructs = extract_constructs(&tree, &content, &language);
74    
75    let parse_time_ms = start_time.elapsed().as_millis() as u64;
76    
77    let path = Path::new(file_path);
78    let relative_path = path
79        .file_name()
80        .unwrap_or_default()
81        .to_string_lossy()
82        .to_string();
83    
84    Ok(ParsedFile {
85        file_path: file_path.to_string(),
86        relative_path,
87        language,
88        constructs,
89        syntax_tree: Some(tree),
90        file_size_bytes,
91        parse_time_ms,
92    })
93}
94
95/// Parse an entire project directory recursively
96/// 
97/// This function traverses a directory structure, identifies source code files,
98/// and parses them concurrently to extract code constructs from all supported files.
99/// 
100/// # Arguments
101/// 
102/// * `dir_path` - Path to the root directory to parse
103/// * `options` - Configuration options controlling parsing behavior
104/// 
105/// # Returns
106/// 
107/// Returns a `ParsedProject` containing results from all parsed files,
108/// including statistics and error information.
109/// 
110/// # Examples
111/// 
112/// ```rust
113/// use tree_parser::{parse_directory, ParseOptions};
114/// 
115/// #[tokio::main]
116/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
117///     let options = ParseOptions::default();
118///     let project = parse_directory("./src", options).await?;
119///     
120///     println!("Parsed {} files", project.total_files_processed);
121///     for (language, count) in &project.language_distribution {
122///         println!("{:?}: {} files", language, count);
123///     }
124///     Ok(())
125/// }
126/// ```
127/// 
128/// # Performance
129/// 
130/// This function uses concurrent processing to parse multiple files simultaneously.
131/// The concurrency level is controlled by `options.max_concurrent_files`.
132pub async fn parse_directory(
133    dir_path: &str,
134    options: ParseOptions,
135) -> Result<ParsedProject, Error> {
136    let start_time = Instant::now();
137    let root_path = PathBuf::from(dir_path);
138    
139    if !root_path.exists() {
140        return Err(Error::Io(format!("Directory does not exist: {}", dir_path)));
141    }
142    
143    // Collect files to parse
144    let files_to_parse = collect_files(&root_path, &options)?;
145    
146    // Parse files in parallel
147    let (parsed_files, error_files) = parse_files_parallel(files_to_parse, &options).await;
148    
149    // Calculate statistics
150    let total_files_processed = parsed_files.len();
151    let processing_time_ms = start_time.elapsed().as_millis() as u64;
152    
153    let mut language_distribution = HashMap::new();
154    for file in &parsed_files {
155        *language_distribution.entry(file.language.clone()).or_insert(0) += 1;
156    }
157    
158    Ok(ParsedProject {
159        root_path: dir_path.to_string(),
160        files: parsed_files,
161        total_files_processed,
162        processing_time_ms,
163        language_distribution,
164        error_files,
165    })
166}
167
168/// Parse a project directory with custom file filtering
169/// 
170/// This function provides advanced filtering capabilities for selecting which files
171/// to parse within a directory structure. It combines the standard parsing options
172/// with custom filtering criteria.
173/// 
174/// # Arguments
175/// 
176/// * `dir_path` - Path to the root directory to parse
177/// * `file_filter` - Custom filter criteria for file selection
178/// * `options` - Configuration options controlling parsing behavior
179/// 
180/// # Returns
181/// 
182/// Returns a `ParsedProject` containing results from all files that match
183/// the filter criteria.
184/// 
185/// # Examples
186/// 
187/// ```rust
188/// use tree_parser::{parse_directory_with_filter, ParseOptions, FileFilter, Language};
189/// use std::sync::Arc;
190/// 
191/// #[tokio::main]
192/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
193///     let filter = FileFilter {
194///         languages: Some(vec![Language::Rust, Language::Python]),
195///         extensions: None,
196///         min_size_bytes: Some(100),
197///         max_size_bytes: Some(100_000),
198///         custom_predicate: Some(Arc::new(|path| {
199///             !path.to_string_lossy().contains("test")
200///         })),
201///     };
202///     
203///     let options = ParseOptions::default();
204///     let project = parse_directory_with_filter("./src", &filter, options).await?;
205///     
206///     println!("Parsed {} filtered files", project.total_files_processed);
207///     Ok(())
208/// }
209/// ```
210pub async fn parse_directory_with_filter(
211    dir_path: &str,
212    file_filter: &crate::FileFilter,
213    options: ParseOptions,
214) -> Result<ParsedProject, Error> {
215    let start_time = Instant::now();
216    let root_path = PathBuf::from(dir_path);
217    
218    if !root_path.exists() {
219        return Err(Error::Io(format!("Directory does not exist: {}", dir_path)));
220    }
221    
222    // Collect files to parse with custom filter
223    let files_to_parse = collect_files_with_filter(&root_path, &options, file_filter)?;
224    
225    // Parse files in parallel
226    let (parsed_files, error_files) = parse_files_parallel(files_to_parse, &options).await;
227    
228    // Calculate statistics
229    let total_files_processed = parsed_files.len();
230    let processing_time_ms = start_time.elapsed().as_millis() as u64;
231    
232    let mut language_distribution = HashMap::new();
233    for file in &parsed_files {
234        *language_distribution.entry(file.language.clone()).or_insert(0) += 1;
235    }
236    
237    Ok(ParsedProject {
238        root_path: dir_path.to_string(),
239        files: parsed_files,
240        total_files_processed,
241        processing_time_ms,
242        language_distribution,
243        error_files,
244    })
245}
246
247/// Collect files to parse from directory based on parsing options
248/// 
249/// This internal function traverses a directory structure and collects all files
250/// that should be parsed according to the provided options.
251/// 
252/// # Arguments
253/// 
254/// * `root_path` - Root directory path to start collection from
255/// * `options` - Parsing options that control file selection
256/// 
257/// # Returns
258/// 
259/// A vector of file paths that should be parsed, or an error if directory
260/// traversal fails.
261fn collect_files(root_path: &Path, options: &ParseOptions) -> Result<Vec<PathBuf>, Error> {
262    let mut files = Vec::new();
263    
264    let walker = if options.recursive {
265        WalkDir::new(root_path)
266    } else {
267        WalkDir::new(root_path).max_depth(1)
268    };
269    
270    for entry in walker {
271        let entry = entry.map_err(|e| Error::Io(e.to_string()))?;
272        let path = entry.path();
273        
274        // Skip directories
275        if path.is_dir() {
276            continue;
277        }
278        
279        // Skip hidden files if not included
280        if !options.include_hidden_files && is_hidden_file(path) {
281            continue;
282        }
283        
284        // Check ignore patterns
285        if should_ignore_file(path, &options.ignore_patterns) {
286            continue;
287        }
288        
289        // Check file size
290        if let Ok(metadata) = path.metadata() {
291            let size_mb = metadata.len() as usize / (1024 * 1024);
292            if size_mb > options.max_file_size_mb {
293                continue;
294            }
295        }
296        
297        // Check if we can detect the language
298        if detect_language_by_extension(&path.to_string_lossy()).is_some() {
299            files.push(path.to_path_buf());
300        }
301    }
302    
303    Ok(files)
304}
305
306/// Collect files with custom filter criteria
307/// 
308/// This internal function extends the basic file collection with additional
309/// filtering capabilities provided by a `FileFilter`.
310/// 
311/// # Arguments
312/// 
313/// * `root_path` - Root directory path to start collection from
314/// * `options` - Parsing options that control file selection
315/// * `filter` - Custom filter criteria for more precise file selection
316/// 
317/// # Returns
318/// 
319/// A vector of file paths that match both the parsing options and the custom
320/// filter criteria.
321fn collect_files_with_filter(
322    root_path: &Path,
323    options: &ParseOptions,
324    filter: &crate::FileFilter,
325) -> Result<Vec<PathBuf>, Error> {
326    let mut files = collect_files(root_path, options)?;
327    
328    // Apply custom filter
329    files.retain(|path| {
330        // Check extensions
331        if let Some(ref extensions) = filter.extensions {
332            if let Some(ext) = path.extension() {
333                if !extensions.contains(&ext.to_string_lossy().to_lowercase()) {
334                    return false;
335                }
336            } else {
337                return false;
338            }
339        }
340        
341        // Check languages
342        if let Some(ref languages) = filter.languages {
343            if let Some(detected_lang) = detect_language_by_extension(&path.to_string_lossy()) {
344                if !languages.contains(&detected_lang) {
345                    return false;
346                }
347            } else {
348                return false;
349            }
350        }
351        
352        // Check file size
353        if let Ok(metadata) = path.metadata() {
354            let size = metadata.len() as usize;
355            
356            if let Some(min_size) = filter.min_size_bytes {
357                if size < min_size {
358                    return false;
359                }
360            }
361            
362            if let Some(max_size) = filter.max_size_bytes {
363                if size > max_size {
364                    return false;
365                }
366            }
367        }
368        
369        // Apply custom predicate
370        if let Some(ref predicate) = filter.custom_predicate {
371            if !predicate(path) {
372                return false;
373            }
374        }
375        
376        true
377    });
378    
379    Ok(files)
380}
381
382/// Parse files in parallel
383async fn parse_files_parallel(
384    files: Vec<PathBuf>,
385    options: &ParseOptions,
386) -> (Vec<ParsedFile>, Vec<FileError>) {
387    let chunk_size = std::cmp::max(1, files.len() / options.max_concurrent_files);
388    let mut parsed_files = Vec::new();
389    let mut error_files = Vec::new();
390    
391    for chunk in files.chunks(chunk_size) {
392        let chunk_results: Vec<_> = chunk
393            .iter()
394            .map(|path| async move {
395                let path_str = path.to_string_lossy().to_string();
396                
397                // Detect language
398                let language = match options.language_detection {
399                    LanguageDetection::ByExtension => detect_language_by_extension(&path_str),
400                    LanguageDetection::Combined => {
401                        // Try to read content for better detection
402                        if let Ok(content) = tokio::fs::read_to_string(path).await {
403                            detect_language(&path_str, Some(&content))
404                        } else {
405                            detect_language_by_extension(&path_str)
406                        }
407                    }
408                    _ => detect_language_by_extension(&path_str), // Fallback
409                };
410                
411                if let Some(lang) = language {
412                    match parse_file(&path_str, lang).await {
413                        Ok(parsed) => Ok(parsed),
414                        Err(e) => Err(FileError {
415                            file_path: path_str,
416                            error_type: ErrorType::ParseError,
417                            message: e.to_string(),
418                        }),
419                    }
420                } else {
421                    Err(FileError {
422                        file_path: path_str,
423                        error_type: ErrorType::UnsupportedLanguage,
424                        message: "Could not detect language".to_string(),
425                    })
426                }
427            })
428            .collect();
429        
430        // Await all tasks in this chunk
431        for result in futures::future::join_all(chunk_results).await {
432            match result {
433                Ok(parsed_file) => parsed_files.push(parsed_file),
434                Err(error) => error_files.push(error),
435            }
436        }
437    }
438    
439    (parsed_files, error_files)
440}
441
442/// Extract code constructs from syntax tree
443fn extract_constructs(tree: &Tree, source: &str, language: &Language) -> Vec<CodeConstruct> {
444    let mut constructs = Vec::new();
445    let root_node = tree.root_node();
446    
447    extract_constructs_recursive(root_node, source, language, &mut constructs, None);
448    
449    constructs
450}
451
452/// Recursively extract constructs from nodes
453fn extract_constructs_recursive(
454    node: Node,
455    source: &str,
456    language: &Language,
457    constructs: &mut Vec<CodeConstruct>,
458    parent: Option<&CodeConstruct>,
459) {
460    let node_type = node.kind();
461    let supported_types = get_supported_node_types(language);
462    
463    if supported_types.contains(&node_type.to_string()) {
464        let construct = create_code_construct(node, source, language);
465        constructs.push(construct);
466    }
467    
468    // Recursively process children
469    for i in 0..node.child_count() {
470        if let Some(child) = node.child(i) {
471            extract_constructs_recursive(child, source, language, constructs, parent);
472        }
473    }
474}
475
476/// Create a CodeConstruct from a tree-sitter node
477fn create_code_construct(node: Node, source: &str, language: &Language) -> CodeConstruct {
478    let start_byte = node.start_byte();
479    let end_byte = node.end_byte();
480    let source_code = source[start_byte..end_byte].to_string();
481    
482    let start_point = node.start_position();
483    let end_point = node.end_position();
484    
485    // Extract name if possible
486    let name = extract_construct_name(node, source);
487    
488    // Create metadata
489    let metadata = extract_metadata(node, source, language);
490    
491    CodeConstruct {
492        node_type: node.kind().to_string(),
493        name,
494        source_code,
495        start_line: start_point.row + 1, // Convert to 1-based
496        end_line: end_point.row + 1,
497        start_byte,
498        end_byte,
499        parent: None, // Will be set later if needed
500        children: Vec::new(), // Will be populated later if needed
501        metadata,
502    }
503}
504
505/// Extract construct name from node
506fn extract_construct_name(node: Node, source: &str) -> Option<String> {
507    // Try to find identifier child
508    for i in 0..node.child_count() {
509        if let Some(child) = node.child(i) {
510            if child.kind() == "identifier" || child.kind() == "name" {
511                let start = child.start_byte();
512                let end = child.end_byte();
513                return Some(source[start..end].to_string());
514            }
515        }
516    }
517    None
518}
519
520/// Extract metadata from node
521fn extract_metadata(_node: Node, _source: &str, _language: &Language) -> ConstructMetadata {
522    ConstructMetadata {
523        visibility: None,
524        modifiers: Vec::new(),
525        parameters: Vec::new(),
526        return_type: None,
527        inheritance: Vec::new(),
528        annotations: Vec::new(),
529        documentation: None,
530    }
531}
532
/// Report whether a path's final component starts with a dot.
///
/// Only the last component is examined, so a regular file inside a hidden
/// directory is not considered hidden. Paths without a final component, or
/// whose final component is not valid UTF-8, yield `false`.
fn is_hidden_file(path: &Path) -> bool {
    match path.file_name().and_then(|component| component.to_str()) {
        Some(file_name) => file_name.starts_with('.'),
        None => false,
    }
}
540
/// Decide whether a path matches any of the ignore patterns.
///
/// A pattern matches when it occurs anywhere in the (lossily converted) path
/// string; this is a plain substring test, not glob matching. Empty patterns
/// are skipped: an empty string is a substring of every path and would
/// otherwise cause every file to be ignored.
///
/// # Arguments
///
/// * `path` - Path to test
/// * `ignore_patterns` - Substrings that mark a path as ignored
///
/// # Returns
///
/// `true` if at least one non-empty pattern occurs in the path.
fn should_ignore_file(path: &Path, ignore_patterns: &[String]) -> bool {
    let path_str = path.to_string_lossy();

    ignore_patterns
        .iter()
        .filter(|pattern| !pattern.is_empty())
        .any(|pattern| path_str.contains(pattern.as_str()))
}