// tree_parser/lib.rs

//! # Tree Parser Library
//!
//! A comprehensive Rust library for parsing and searching code elements across multiple programming languages
//! using tree-sitter. This library provides powerful tools for static code analysis, code search, and AST manipulation.
//!
//! ## Features
//!
//! - **Multi-language Support**: Parse Python, Rust, JavaScript, TypeScript, Java, C, C++, Go, and more
//! - **High Performance**: Concurrent parsing with async/await for maximum efficiency
//! - **Advanced Search**: Find functions, classes, structs, interfaces with regex pattern matching
//! - **Flexible Filtering**: Custom file filters and parsing options
//! - **Rich Metadata**: Extract detailed information about code constructs
//! - **Type Safety**: Full Rust type safety with comprehensive error handling
//! - **Configurable**: Extensive configuration options for different use cases
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use tree_parser::{parse_file, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     // Parse a single file
//!     let parsed_file = parse_file("src/main.rs", Language::Rust).await?;
//!
//!     println!("Found {} constructs", parsed_file.constructs.len());
//!     for construct in &parsed_file.constructs {
//!         if let Some(name) = &construct.name {
//!             println!("{}: {} (lines {}-{})",
//!                 construct.node_type, name,
//!                 construct.start_line, construct.end_line);
//!         }
//!     }
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Finding Code Constructs
//!
//! This library provides several powerful methods to search for specific code constructs:
//!
//! ### 1. Search by Node Type
//!
//! ```rust,no_run
//! use tree_parser::{parse_file, search_by_node_type, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let parsed_file = parse_file("example.py", Language::Python).await?;
//!
//!     // Find all function definitions
//!     let functions = search_by_node_type(&parsed_file, "function_definition", None);
//!
//!     // Find test functions using regex
//!     let test_functions = search_by_node_type(&parsed_file, "function_definition", Some(r"^test_.*"));
//!
//!     println!("Found {} functions, {} are tests", functions.len(), test_functions.len());
//!     Ok(())
//! }
//! ```
//!
//! ### 2. Search by Multiple Node Types
//!
//! ```rust,no_run
//! use tree_parser::{parse_file, search_by_multiple_node_types, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let parsed_file = parse_file("example.js", Language::JavaScript).await?;
//!
//!     // Find all function-like constructs
//!     let functions = search_by_multiple_node_types(
//!         &parsed_file,
//!         &["function_declaration", "function_expression", "arrow_function"],
//!         None
//!     );
//!
//!     println!("Found {} function-like constructs", functions.len());
//!     Ok(())
//! }
//! ```
//!
//! ### 3. Advanced Search with Tree-sitter Queries
//!
//! ```rust,no_run
//! use tree_parser::{parse_file, search_by_query, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let parsed_file = parse_file("example.py", Language::Python).await?;
//!
//!     // Find all class definitions with their methods
//!     let query = r#"
//!         (class_definition
//!           name: (identifier) @class_name
//!           body: (block
//!             (function_definition
//!               name: (identifier) @method_name)))
//!     "#;
//!
//!     let classes_with_methods = search_by_query(&parsed_file, query)?;
//!     println!("Found {} classes with methods", classes_with_methods.len());
//!     Ok(())
//! }
//! ```
//!
//! ## Finding Node Types
//!
//! To effectively search for code constructs, you need to know the correct node types.
//! Here are the most common node types by language:
//!
//! ### Python
//! - `function_definition` - Function definitions
//! - `class_definition` - Class definitions
//! - `import_statement` - Import statements
//! - `decorated_definition` - Functions/classes with decorators
//! - `assignment` - Variable assignments
//!
//! ### Rust
//! - `function_item` - Function definitions
//! - `struct_item` - Struct definitions
//! - `impl_item` - Implementation blocks
//! - `trait_item` - Trait definitions
//! - `enum_item` - Enum definitions
//! - `mod_item` - Module definitions
//!
//! ### JavaScript/TypeScript
//! - `function_declaration` - Function declarations
//! - `function_expression` - Function expressions
//! - `arrow_function` - Arrow functions
//! - `method_definition` - Class methods
//! - `class_declaration` - Class declarations
//!
//! ### Java
//! - `method_declaration` - Method definitions
//! - `class_declaration` - Class declarations
//! - `interface_declaration` - Interface declarations
//! - `constructor_declaration` - Constructor definitions
//!
//! For a complete list of node types, inspect your parsed files or consult the
//! tree-sitter grammar documentation for your target language.
//!
//! ### Discovering Node Types
//!
//! ```rust,no_run
//! use tree_parser::{parse_file, Language};
//! use std::collections::HashSet;
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let parsed_file = parse_file("your_file.py", Language::Python).await?;
//!
//!     // Collect all unique node types
//!     let mut node_types: HashSet<String> = HashSet::new();
//!     for construct in &parsed_file.constructs {
//!         node_types.insert(construct.node_type.clone());
//!     }
//!
//!     println!("Available node types:");
//!     for node_type in &node_types {
//!         println!("  - {}", node_type);
//!     }
//!
//!     Ok(())
//! }
//! ```
//!
//! ### Online Tree-sitter Playground
//!
//! Use the [Tree-sitter Playground](https://tree-sitter.github.io/tree-sitter/playground) to:
//! 1. Paste your code
//! 2. Select the appropriate language
//! 3. Explore the generated syntax tree
//! 4. Identify the exact node types you need
//!
//! ## Best Practices
//!
//! ### Performance Optimization
//! - Increase `max_concurrent_files` for better performance on multi-core systems
//! - Use file filters to exclude unnecessary files (node_modules, target, .git, etc.)
//! - Set appropriate `max_file_size_mb` limits to skip very large files
//! - Enable caching with `enable_caching: true` for repeated operations
//! - Use `LanguageDetection::ByExtension` for faster processing
//!
//! ### Memory Management
//! - Set `syntax_tree: None` after extracting constructs if you don't need the tree
//! - Process files in batches rather than loading entire projects
//! - Use streaming approaches for very large codebases
//!
//! ### Error Handling
//! - Always check `project.error_files` for individual file parsing errors
//! - Handle different `ErrorType` variants appropriately
//! - Use proper error propagation with the `?` operator
//!
//! ## Troubleshooting
//!
//! **Common Issues:**
//! - "Unsupported language" error: Enable the correct feature flags in Cargo.toml
//! - "Parse error" for valid code: Check for syntax errors or unsupported language features
//! - Poor performance: Increase concurrency, use filters, enable caching
//! - Memory issues: Drop syntax trees after use, process in batches
//! - Missing constructs: Verify node type names, check nesting, use tree-sitter queries
205use std::collections::HashMap;
206use std::path::Path;
207use std::sync::Arc;
208
209use serde::{Deserialize, Serialize};
210use thiserror::Error;
211use tree_sitter::Tree;
212
213// Re-export commonly used types
214pub use tree_sitter::{Point, Range};
215
216// Language modules
217mod languages;
218pub use languages::*;
219
/// Main error type for the tree parser library
///
/// This enum represents all possible errors that can occur during parsing operations.
/// All variants are serializable and carry only plain strings or integers, which is
/// what allows the type to derive `Clone`, equality, and the serde traits.
#[derive(Error, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum Error {
    /// An I/O failure; the payload is the text shown after `IO error: `.
    #[error("IO error: {0}")]
    Io(String),
    /// A parsing failure; the payload is the text shown after `Parse error: `.
    #[error("Parse error: {0}")]
    Parse(String),
    /// The requested language is not supported; the payload identifies the language.
    #[error("Unsupported language: {0}")]
    UnsupportedLanguage(String),
    /// The file exceeds the size limit; the payload is a size in bytes.
    #[error("File too large: {0} bytes")]
    FileTooLarge(usize),
    /// Access was denied; the payload is the text shown after `Permission denied: `.
    #[error("Permission denied: {0}")]
    PermissionDenied(String),
    /// A query was rejected; the payload is the text shown after `Invalid query: `.
    #[error("Invalid query: {0}")]
    InvalidQuery(String),
}
240/// Categorizes different types of errors for easier handling
241/// 
242/// This enum is used to classify errors into broad categories, making it easier
243/// to implement different error handling strategies for different error types.
244#[derive(Debug, Clone, Serialize, Deserialize)]
245pub enum ErrorType {
246    ParseError,
247    IoError,
248    UnsupportedLanguage,
249    FileTooLarge,
250    PermissionDenied,
251}
252
253/// Represents an error that occurred while processing a specific file
254/// 
255/// This struct contains detailed information about parsing failures,
256/// including the file path, error type, and a descriptive message.
257#[derive(Debug, Clone, Serialize, Deserialize)]
258pub struct FileError {
259    pub file_path: String,
260    pub error_type: ErrorType,
261    pub message: String,
262}
263
264/// Supported programming languages
265/// 
266/// This enum represents all programming languages that the tree parser can handle.
267/// Each language corresponds to a specific tree-sitter grammar.
268/// 
269/// # Feature Flags
270/// 
271/// Most languages are gated behind feature flags to reduce compilation time and binary size:
272/// - `python` - Python support
273/// - `rust_lang` - Rust support  
274/// - `javascript` - JavaScript support
275/// - `typescript` - TypeScript support
276/// - `java` - Java support
277/// - `c` - C support
278/// - `cpp` - C++ support
279/// - `go` - Go support
280/// - `full` - All languages
281#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
282pub enum Language {
283    Python,
284    Rust,
285    JavaScript,
286    TypeScript,
287    Java,
288    C,
289    Cpp,
290    Go,
291    CSharp,
292    Php,
293    Ruby,
294    Swift,
295    Kotlin,
296    Scala,
297    Haskell,
298    Lua,
299    Perl,
300    R,
301    Bash,
302    PowerShell,
303    Html,
304    Css,
305    Sql,
306    Json,
307    Yaml,
308    Toml,
309    Xml,
310}
311
312/// Methods for detecting the programming language of a file
313/// 
314/// This enum defines different strategies for automatically determining
315/// the programming language of a source code file.
316#[derive(Debug, Clone, Serialize, Deserialize)]
317pub enum LanguageDetection {
318    ByExtension,
319    ByContent,
320    ByShebang,
321    Combined, // Uses all methods with fallback priority
322}
323
324/// Represents a function or method parameter
325/// 
326/// This struct contains detailed information about a parameter including
327/// its name, type, default value, and whether it's variadic.
328#[derive(Debug, Clone, Serialize, Deserialize)]
329pub struct Parameter {
330    pub name: String,
331    pub param_type: Option<String>,
332    pub default_value: Option<String>,
333    pub is_variadic: bool,
334}
335
336/// Metadata associated with a code construct
337/// 
338/// This struct contains additional information about code constructs such as
339/// visibility modifiers, parameters, return types, inheritance, and documentation.
340#[derive(Debug, Clone, Serialize, Deserialize)]
341pub struct ConstructMetadata {
342    pub visibility: Option<String>,
343    pub modifiers: Vec<String>,
344    pub parameters: Vec<Parameter>,
345    pub return_type: Option<String>,
346    pub inheritance: Vec<String>,
347    pub annotations: Vec<String>,
348    pub documentation: Option<String>,
349}
350
351/// Represents a parsed code construct (function, class, struct, etc.)
352/// 
353/// This is the core data structure that represents any identifiable code element
354/// found during parsing. It includes the construct's location, content, metadata,
355/// and hierarchical relationships with other constructs.
356#[derive(Debug, Clone, Serialize, Deserialize)]
357pub struct CodeConstruct {
358    pub node_type: String,
359    pub name: Option<String>,
360    pub source_code: String,
361    pub start_line: usize,
362    pub end_line: usize,
363    pub start_byte: usize,
364    pub end_byte: usize,
365    pub parent: Option<Box<CodeConstruct>>,
366    pub children: Vec<CodeConstruct>,
367    pub metadata: ConstructMetadata,
368}
369
370/// Represents a successfully parsed source code file
371/// 
372/// This struct contains all information extracted from a single file,
373/// including the parsed constructs, metadata, and performance metrics.
374#[derive(Debug, Clone, Serialize, Deserialize)]
375pub struct ParsedFile {
376    pub file_path: String,
377    pub relative_path: String,
378    pub language: Language,
379    pub constructs: Vec<CodeConstruct>,
380    #[serde(skip)]
381    pub syntax_tree: Option<Tree>,
382    pub file_size_bytes: usize,
383}
384
385/// Represents the results of parsing an entire project or directory
386/// 
387/// This struct aggregates the results of parsing multiple files,
388/// including success metrics, error information, and language distribution.
389#[derive(Debug, Clone, Serialize, Deserialize)]
390pub struct ParsedProject {
391    pub root_path: String,
392    pub files: Vec<ParsedFile>,
393    pub total_files_processed: usize,
394    pub language_distribution: HashMap<Language, usize>,
395    pub error_files: Vec<FileError>,
396}
397
398/// Filter criteria for selecting which files to parse
399/// 
400/// This struct allows you to specify various criteria for filtering files
401/// during directory parsing operations. All criteria are optional and are
402/// combined with AND logic when multiple criteria are specified.
403/// 
404/// # Examples
405/// 
406/// ```rust
407/// use tree_parser::{FileFilter, Language};
408/// use std::sync::Arc;
409/// 
410/// // Filter for Rust files only
411/// let filter = FileFilter {
412///     languages: Some(vec![Language::Rust]),
413///     extensions: None,
414///     min_size_bytes: None,
415///     max_size_bytes: None,
416///     custom_predicate: None,
417/// };
418/// 
419/// // Filter with custom logic
420/// let filter = FileFilter {
421///     languages: None,
422///     extensions: Some(vec!["rs".to_string(), "py".to_string()]),
423///     min_size_bytes: Some(100),
424///     max_size_bytes: Some(50_000),
425///     custom_predicate: Some(Arc::new(|path| {
426///         !path.to_string_lossy().contains("test")
427///     })),
428/// };
429/// ```
430#[derive(Clone)]
431pub struct FileFilter {
432    /// File extensions to include (e.g., ["rs", "py"]). None means all supported extensions.
433    pub extensions: Option<Vec<String>>,
434    /// Programming languages to include. None means all supported languages.
435    pub languages: Option<Vec<Language>>,
436    /// Minimum file size in bytes. Files smaller than this are excluded.
437    pub min_size_bytes: Option<usize>,
438    /// Maximum file size in bytes. Files larger than this are excluded.
439    pub max_size_bytes: Option<usize>,
440    /// Custom predicate function for advanced filtering logic
441    pub custom_predicate: Option<Arc<dyn Fn(&Path) -> bool + Send + Sync>>,
442}
443
444/// Configuration options for parsing operations
445/// 
446/// This struct provides extensive configuration options for controlling
447/// how files are parsed, including concurrency settings, file size limits,
448/// and language detection strategies.
449/// 
450/// # Examples
451/// 
452/// ```rust
453/// use tree_parser::{ParseOptions, LanguageDetection};
454/// 
455/// // Use default options
456/// let options = ParseOptions::default();
457/// 
458/// // Custom configuration
459/// let options = ParseOptions {
460///     max_concurrent_files: 8,
461///     include_hidden_files: false,
462///     max_file_size_mb: 5,
463///     recursive: true,
464///     ignore_patterns: vec!["target".to_string(), "node_modules".to_string()],
465///     language_detection: LanguageDetection::Combined,
466///     enable_caching: true,
467///     thread_pool_size: Some(4),
468/// };
469/// ```
470#[derive(Debug, Clone, Serialize, Deserialize)]
471pub struct ParseOptions {
472    /// Maximum number of files to parse concurrently (default: 2 * CPU cores)
473    pub max_concurrent_files: usize,
474    /// Whether to include hidden files (files starting with '.') in parsing
475    pub include_hidden_files: bool,
476    /// Maximum file size in megabytes to parse (larger files are skipped)
477    pub max_file_size_mb: usize,
478    /// Whether to recursively parse subdirectories
479    pub recursive: bool,
480    /// Patterns to ignore during directory traversal (supports glob patterns)
481    pub ignore_patterns: Vec<String>,
482    /// Strategy for detecting the programming language of files
483    pub language_detection: LanguageDetection,
484    /// Whether to enable internal caching for improved performance
485    pub enable_caching: bool,
486    /// Optional thread pool size (None uses system default)
487    pub thread_pool_size: Option<usize>,
488}
489
490impl Default for ParseOptions {
491    fn default() -> Self {
492        Self {
493            max_concurrent_files: num_cpus::get() * 2,
494            include_hidden_files: false,
495            max_file_size_mb: 10,
496            recursive: true,
497            ignore_patterns: vec![
498                "node_modules".to_string(),
499                ".git".to_string(),
500                "target".to_string(),
501                "build".to_string(),
502            ],
503            language_detection: LanguageDetection::ByExtension,
504            enable_caching: true,
505            thread_pool_size: None, // Uses system default
506        }
507    }
508}
509
510// Core API functions will be implemented in separate modules
511mod parser;
512mod search;
513mod utils;
514
515pub use parser::*;
516pub use search::*;
517pub use utils::*;
518// pub use test_compile::*; // Commented out as not currently used
519
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that common file extensions map to the expected language.
    #[test]
    fn test_language_detection() {
        let expectations = [
            ("test.py", Language::Python),
            ("test.rs", Language::Rust),
            ("test.js", Language::JavaScript),
        ];

        for (file_name, language) in expectations {
            assert_eq!(detect_language_by_extension(file_name), Some(language));
        }
    }
}