tree_parser/
parser.rs

1//! Core parsing functionality
2
3use crate::{
4    languages::*, CodeConstruct, ConstructMetadata, Error, ErrorType, FileError, Language,
5    LanguageDetection, ParseOptions, ParsedFile, ParsedProject,
6};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9
10use tokio::fs;
11use tree_sitter::{Node, Parser, Tree};
12use walkdir::WalkDir;
13
14/// Parse a single source code file and extract code constructs
15/// 
16/// This function reads a source code file, parses it using tree-sitter,
17/// and extracts all identifiable code constructs (functions, classes, etc.).
18/// 
19/// # Arguments
20/// 
21/// * `file_path` - Path to the source code file to parse
22/// * `language` - The programming language of the file
23/// 
24/// # Returns
25/// 
26/// Returns a `ParsedFile` containing all extracted constructs and metadata,
27/// or an `Error` if parsing fails.
28/// 
29/// # Examples
30/// 
31/// ```rust
32/// use tree_parser::{parse_file, Language};
33/// 
34/// #[tokio::main]
35/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
36///     let result = parse_file("src/main.rs", Language::Rust).await?;
37///     println!("Found {} constructs", result.constructs.len());
38///     Ok(())
39/// }
40/// ```
41/// 
42/// # Errors
43/// 
44/// This function will return an error if:
45/// - The file cannot be read (I/O error)
46/// - The file content cannot be parsed (syntax error)
47/// - The specified language is not supported
48pub async fn parse_file(file_path: &str, language: Language) -> Result<ParsedFile, Error> {
49    // Read file content
50    let content = fs::read_to_string(file_path)
51        .await
52        .map_err(|e| Error::Io(e.to_string()))?;
53    
54    let file_size_bytes = content.len();
55    
56    // Get tree-sitter language
57    let ts_language = get_tree_sitter_language(&language)?;
58    
59    // Create parser
60    let mut parser = Parser::new();
61    parser
62        .set_language(&ts_language)
63        .map_err(|e| Error::Parse(e.to_string()))?;
64    
65    // Parse the content
66    let tree = parser
67        .parse(&content, None)
68        .ok_or_else(|| Error::Parse("Failed to parse file".to_string()))?;
69    
70    // Extract code constructs
71    let constructs = extract_constructs(&tree, &content, &language);
72    
73    let path = Path::new(file_path);
74    let relative_path = path
75        .file_name()
76        .unwrap_or_default()
77        .to_string_lossy()
78        .to_string();
79    
80    Ok(ParsedFile {
81        file_path: file_path.to_string(),
82        relative_path,
83        language,
84        constructs,
85        syntax_tree: Some(tree),
86        file_size_bytes,
87
88    })
89}
90
91/// Parse an entire project directory recursively
92/// 
93/// This function traverses a directory structure, identifies source code files,
94/// and parses them concurrently to extract code constructs from all supported files.
95/// 
96/// # Arguments
97/// 
98/// * `dir_path` - Path to the root directory to parse
99/// * `options` - Configuration options controlling parsing behavior
100/// 
101/// # Returns
102/// 
103/// Returns a `ParsedProject` containing results from all parsed files,
104/// including statistics and error information.
105/// 
106/// # Examples
107/// 
108/// ```rust
109/// use tree_parser::{parse_directory, ParseOptions};
110/// 
111/// #[tokio::main]
112/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
113///     let options = ParseOptions::default();
114///     let project = parse_directory("./src", options).await?;
115///     
116///     println!("Parsed {} files", project.total_files_processed);
117///     for (language, count) in &project.language_distribution {
118///         println!("{:?}: {} files", language, count);
119///     }
120///     Ok(())
121/// }
122/// ```
123/// 
124/// # Performance
125/// 
126/// This function uses concurrent processing to parse multiple files simultaneously.
127/// The concurrency level is controlled by `options.max_concurrent_files`.
128pub async fn parse_directory(
129    dir_path: &str,
130    options: ParseOptions,
131) -> Result<ParsedProject, Error> {
132    let root_path = PathBuf::from(dir_path);
133    
134    if !root_path.exists() {
135        return Err(Error::Io(format!("Directory does not exist: {}", dir_path)));
136    }
137    
138    // Collect files to parse
139    let files_to_parse = collect_files(&root_path, &options)?;
140    
141    // Parse files in parallel
142    let (parsed_files, error_files) = parse_files_parallel(files_to_parse, &options).await;
143    
144    // Calculate statistics
145    let total_files_processed = parsed_files.len();
146    let mut language_distribution = HashMap::new();
147    for file in &parsed_files {
148        *language_distribution.entry(file.language.clone()).or_insert(0) += 1;
149    }
150    
151    Ok(ParsedProject {
152        root_path: dir_path.to_string(),
153        files: parsed_files,
154        total_files_processed,
155        language_distribution,
156        error_files,
157    })
158}
159
160/// Parse a project directory with custom file filtering
161/// 
162/// This function provides advanced filtering capabilities for selecting which files
163/// to parse within a directory structure. It combines the standard parsing options
164/// with custom filtering criteria.
165/// 
166/// # Arguments
167/// 
168/// * `dir_path` - Path to the root directory to parse
169/// * `file_filter` - Custom filter criteria for file selection
170/// * `options` - Configuration options controlling parsing behavior
171/// 
172/// # Returns
173/// 
174/// Returns a `ParsedProject` containing results from all files that match
175/// the filter criteria.
176/// 
177/// # Examples
178/// 
179/// ```rust
180/// use tree_parser::{parse_directory_with_filter, ParseOptions, FileFilter, Language};
181/// use std::sync::Arc;
182/// 
183/// #[tokio::main]
184/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
185///     let filter = FileFilter {
186///         languages: Some(vec![Language::Rust, Language::Python]),
187///         extensions: None,
188///         min_size_bytes: Some(100),
189///         max_size_bytes: Some(100_000),
190///         custom_predicate: Some(Arc::new(|path| {
191///             !path.to_string_lossy().contains("test")
192///         })),
193///     };
194///     
195///     let options = ParseOptions::default();
196///     let project = parse_directory_with_filter("./src", &filter, options).await?;
197///     
198///     println!("Parsed {} filtered files", project.total_files_processed);
199///     Ok(())
200/// }
201/// ```
202pub async fn parse_directory_with_filter(
203    dir_path: &str,
204    file_filter: &crate::FileFilter,
205    options: ParseOptions,
206) -> Result<ParsedProject, Error> {
207    let root_path = PathBuf::from(dir_path);
208    
209    if !root_path.exists() {
210        return Err(Error::Io(format!("Directory does not exist: {}", dir_path)));
211    }
212    
213    // Collect files to parse with custom filter
214    let files_to_parse = collect_files_with_filter(&root_path, &options, file_filter)?;
215    
216    // Parse files in parallel
217    let (parsed_files, error_files) = parse_files_parallel(files_to_parse, &options).await;
218    
219    // Calculate statistics
220    let total_files_processed = parsed_files.len();
221    let mut language_distribution = HashMap::new();
222    for file in &parsed_files {
223        *language_distribution.entry(file.language.clone()).or_insert(0) += 1;
224    }
225    
226    Ok(ParsedProject {
227        root_path: dir_path.to_string(),
228        files: parsed_files,
229        total_files_processed,
230
231        language_distribution,
232        error_files,
233    })
234}
235
236/// Collect files to parse from directory based on parsing options
237/// 
238/// This internal function traverses a directory structure and collects all files
239/// that should be parsed according to the provided options.
240/// 
241/// # Arguments
242/// 
243/// * `root_path` - Root directory path to start collection from
244/// * `options` - Parsing options that control file selection
245/// 
246/// # Returns
247/// 
248/// A vector of file paths that should be parsed, or an error if directory
249/// traversal fails.
250fn collect_files(root_path: &Path, options: &ParseOptions) -> Result<Vec<PathBuf>, Error> {
251    let mut files = Vec::new();
252    
253    let walker = if options.recursive {
254        WalkDir::new(root_path)
255    } else {
256        WalkDir::new(root_path).max_depth(1)
257    };
258    
259    for entry in walker {
260        let entry = entry.map_err(|e| Error::Io(e.to_string()))?;
261        let path = entry.path();
262        
263        // Skip directories
264        if path.is_dir() {
265            continue;
266        }
267        
268        // Skip hidden files if not included
269        if !options.include_hidden_files && is_hidden_file(path) {
270            continue;
271        }
272        
273        // Check ignore patterns
274        if should_ignore_file(path, &options.ignore_patterns) {
275            continue;
276        }
277        
278        // Check file size
279        if let Ok(metadata) = path.metadata() {
280            let size_mb = metadata.len() as usize / (1024 * 1024);
281            if size_mb > options.max_file_size_mb {
282                continue;
283            }
284        }
285        
286        // Check if we can detect the language
287        if detect_language_by_extension(&path.to_string_lossy()).is_some() {
288            files.push(path.to_path_buf());
289        }
290    }
291    
292    Ok(files)
293}
294
295/// Collect files with custom filter criteria
296/// 
297/// This internal function extends the basic file collection with additional
298/// filtering capabilities provided by a `FileFilter`.
299/// 
300/// # Arguments
301/// 
302/// * `root_path` - Root directory path to start collection from
303/// * `options` - Parsing options that control file selection
304/// * `filter` - Custom filter criteria for more precise file selection
305/// 
306/// # Returns
307/// 
308/// A vector of file paths that match both the parsing options and the custom
309/// filter criteria.
310fn collect_files_with_filter(
311    root_path: &Path,
312    options: &ParseOptions,
313    filter: &crate::FileFilter,
314) -> Result<Vec<PathBuf>, Error> {
315    let mut files = collect_files(root_path, options)?;
316    
317    // Apply custom filter
318    files.retain(|path| {
319        // Check extensions
320        if let Some(ref extensions) = filter.extensions {
321            if let Some(ext) = path.extension() {
322                if !extensions.contains(&ext.to_string_lossy().to_lowercase()) {
323                    return false;
324                }
325            } else {
326                return false;
327            }
328        }
329        
330        // Check languages
331        if let Some(ref languages) = filter.languages {
332            if let Some(detected_lang) = detect_language_by_extension(&path.to_string_lossy()) {
333                if !languages.contains(&detected_lang) {
334                    return false;
335                }
336            } else {
337                return false;
338            }
339        }
340        
341        // Check file size
342        if let Ok(metadata) = path.metadata() {
343            let size = metadata.len() as usize;
344            
345            if let Some(min_size) = filter.min_size_bytes {
346                if size < min_size {
347                    return false;
348                }
349            }
350            
351            if let Some(max_size) = filter.max_size_bytes {
352                if size > max_size {
353                    return false;
354                }
355            }
356        }
357        
358        // Apply custom predicate
359        if let Some(ref predicate) = filter.custom_predicate {
360            if !predicate(path) {
361                return false;
362            }
363        }
364        
365        true
366    });
367    
368    Ok(files)
369}
370
371/// Parse files in parallel
372async fn parse_files_parallel(
373    files: Vec<PathBuf>,
374    options: &ParseOptions,
375) -> (Vec<ParsedFile>, Vec<FileError>) {
376    let chunk_size = std::cmp::max(1, files.len() / options.max_concurrent_files);
377    let mut parsed_files = Vec::new();
378    let mut error_files = Vec::new();
379    
380    for chunk in files.chunks(chunk_size) {
381        let chunk_results: Vec<_> = chunk
382            .iter()
383            .map(|path| async move {
384                let path_str = path.to_string_lossy().to_string();
385                
386                // Detect language
387                let language = match options.language_detection {
388                    LanguageDetection::ByExtension => detect_language_by_extension(&path_str),
389                    LanguageDetection::Combined => {
390                        // Try to read content for better detection
391                        if let Ok(content) = tokio::fs::read_to_string(path).await {
392                            detect_language(&path_str, Some(&content))
393                        } else {
394                            detect_language_by_extension(&path_str)
395                        }
396                    }
397                    _ => detect_language_by_extension(&path_str), // Fallback
398                };
399                
400                if let Some(lang) = language {
401                    match parse_file(&path_str, lang).await {
402                        Ok(parsed) => Ok(parsed),
403                        Err(e) => Err(FileError {
404                            file_path: path_str,
405                            error_type: ErrorType::ParseError,
406                            message: e.to_string(),
407                        }),
408                    }
409                } else {
410                    Err(FileError {
411                        file_path: path_str,
412                        error_type: ErrorType::UnsupportedLanguage,
413                        message: "Could not detect language".to_string(),
414                    })
415                }
416            })
417            .collect();
418        
419        // Await all tasks in this chunk
420        for result in futures::future::join_all(chunk_results).await {
421            match result {
422                Ok(parsed_file) => parsed_files.push(parsed_file),
423                Err(error) => error_files.push(error),
424            }
425        }
426    }
427    
428    (parsed_files, error_files)
429}
430
431/// Extract code constructs from syntax tree
432fn extract_constructs(tree: &Tree, source: &str, language: &Language) -> Vec<CodeConstruct> {
433    let root_node = tree.root_node();
434    let mut root_constructs = Vec::new();
435    
436    // Extract constructs with proper parent-child relationships
437    extract_constructs_hierarchical(root_node, source, language, &mut root_constructs, None);
438    
439    // Flatten the hierarchy for the final result while preserving relationships
440    let mut all_constructs = Vec::new();
441    flatten_constructs(&root_constructs, &mut all_constructs);
442    
443    all_constructs
444}
445
446/// Recursively extract constructs from nodes with proper hierarchy
447fn extract_constructs_hierarchical(
448    node: Node,
449    source: &str,
450    language: &Language,
451    constructs: &mut Vec<CodeConstruct>,
452    parent_construct: Option<&CodeConstruct>,
453) {
454    let node_type = node.kind();
455    let supported_types = get_supported_node_types(language);
456    
457    if supported_types.contains(&node_type.to_string()) {
458        let mut construct = create_code_construct_with_parent(node, source, language, parent_construct);
459        
460        // Recursively process children and add them to this construct
461        let mut child_constructs = Vec::new();
462        for i in 0..node.child_count() {
463            if let Some(child) = node.child(i) {
464                extract_constructs_hierarchical(child, source, language, &mut child_constructs, Some(&construct));
465            }
466        }
467        
468        construct.children = child_constructs;
469        constructs.push(construct);
470    } else {
471        // If this node is not a supported construct, continue searching in its children
472        for i in 0..node.child_count() {
473            if let Some(child) = node.child(i) {
474                extract_constructs_hierarchical(child, source, language, constructs, parent_construct);
475            }
476        }
477    }
478}
479
480/// Flatten hierarchical constructs into a single vector while preserving relationships
481fn flatten_constructs(constructs: &[CodeConstruct], flattened: &mut Vec<CodeConstruct>) {
482    for construct in constructs {
483        flattened.push(construct.clone());
484        flatten_constructs(&construct.children, flattened);
485    }
486}
487
488/// Create a CodeConstruct from a tree-sitter node with proper parent relationship
489fn create_code_construct_with_parent(
490    node: Node, 
491    source: &str, 
492    language: &Language,
493    parent_construct: Option<&CodeConstruct>
494) -> CodeConstruct {
495    let start_byte = node.start_byte();
496    let end_byte = node.end_byte();
497    let source_code = source[start_byte..end_byte].to_string();
498    
499    let start_point = node.start_position();
500    let end_point = node.end_position();
501    
502    // Extract name if possible
503    let name = extract_construct_name(node, source);
504    
505    // Create metadata
506    let metadata = extract_metadata(node, source, language);
507    
508    // Set parent if provided
509    let parent = parent_construct.map(|p| Box::new(p.clone()));
510    
511    CodeConstruct {
512        node_type: node.kind().to_string(),
513        name,
514        source_code,
515        start_line: start_point.row + 1, // Convert to 1-based
516        end_line: end_point.row + 1,
517        start_byte,
518        end_byte,
519        parent,
520        children: Vec::new(), // Will be populated by the caller
521        metadata,
522    }
523}
524
525/// Extract construct name from node
526fn extract_construct_name(node: Node, source: &str) -> Option<String> {
527    // Try to find identifier child
528    for i in 0..node.child_count() {
529        if let Some(child) = node.child(i) {
530            if child.kind() == "identifier" || child.kind() == "name" {
531                let start = child.start_byte();
532                let end = child.end_byte();
533                return Some(source[start..end].to_string());
534            }
535        }
536    }
537    None
538}
539
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Language;

    /// A method nested inside a class must be linked to that class in both
    /// directions: the method records the class as parent, and the class
    /// lists the method among its children.
    #[test]
    fn test_parent_child_relationships() {
        // Minimal Python snippet: one class containing one method.
        let source = "class TestClass:\n    def test_method(self):\n        pass";

        // Parse the snippet with the Python grammar.
        let python = crate::languages::get_tree_sitter_language(&Language::Python).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&python).unwrap();
        let tree = parser.parse(source, None).unwrap();

        let constructs = extract_constructs(&tree, source, &Language::Python);

        // Both constructs must be present in the flattened result.
        let class_construct = constructs
            .iter()
            .find(|c| c.node_type == "class_definition");
        let method_construct = constructs
            .iter()
            .find(|c| c.node_type == "function_definition");

        assert!(class_construct.is_some(), "Should find class construct");
        assert!(method_construct.is_some(), "Should find method construct");

        // Child -> parent link.
        let method = method_construct.unwrap();
        assert!(method.parent.is_some(), "Method should have a parent");
        match &method.parent {
            Some(parent) => {
                assert_eq!(parent.node_type, "class_definition", "Method's parent should be the class");
            }
            None => {}
        }

        // Parent -> child link.
        let class = class_construct.unwrap();
        assert!(!class.children.is_empty(), "Class should have children");

        let child_method = class
            .children
            .iter()
            .find(|c| c.node_type == "function_definition");
        assert!(child_method.is_some(), "Class should contain the method as a child");
    }
}
582
583/// Extract metadata from node
584fn extract_metadata(_node: Node, _source: &str, _language: &Language) -> ConstructMetadata {
585    ConstructMetadata {
586        visibility: None,
587        modifiers: Vec::new(),
588        parameters: Vec::new(),
589        return_type: None,
590        inheritance: Vec::new(),
591        annotations: Vec::new(),
592        documentation: None,
593    }
594}
595
/// Tell whether a path names a hidden file
///
/// A file counts as hidden when its final path component starts with a dot.
/// Only that last component is examined; parent directories are not. Paths
/// without a final component, or whose final component is not valid UTF-8,
/// are reported as not hidden.
fn is_hidden_file(path: &Path) -> bool {
    match path.file_name().and_then(|os_name| os_name.to_str()) {
        Some(file_name) => file_name.starts_with('.'),
        None => false,
    }
}
603
/// Check if file should be ignored based on patterns
///
/// A file is ignored when its path (converted lossily to text) contains at
/// least one of the patterns as a plain substring; no glob or regex syntax is
/// interpreted. Empty patterns are skipped: an empty string is a substring of
/// every path and would otherwise cause every file to be ignored.
///
/// # Arguments
///
/// * `path` - Path of the file to test
/// * `ignore_patterns` - Substrings that mark a path as ignored
///
/// # Returns
///
/// `true` if any non-empty pattern occurs in the path, `false` otherwise
/// (including when the pattern list is empty).
fn should_ignore_file(path: &Path, ignore_patterns: &[String]) -> bool {
    let path_str = path.to_string_lossy();

    ignore_patterns
        .iter()
        .filter(|pattern| !pattern.is_empty())
        .any(|pattern| path_str.contains(pattern.as_str()))
}