tree_parser/
lib.rs

1//! # Tree Parser Library
2//!
3//! A comprehensive Rust library for parsing and searching code elements across multiple programming languages
4//! using tree-sitter. This library provides powerful tools for static code analysis, code search, and AST manipulation.
5//!
6//! ## Features
7//!
8//! - **Multi-language Support**: Parse Python, Rust, JavaScript, TypeScript, Java, C, C++, Go, and more
9//! - **High Performance**: Concurrent parsing with async/await for maximum efficiency
10//! - **Advanced Search**: Find functions, classes, structs, interfaces with regex pattern matching
11//! - **Flexible Filtering**: Custom file filters and parsing options
12//! - **Rich Metadata**: Extract detailed information about code constructs
13//! - **Type Safety**: Full Rust type safety with comprehensive error handling
14//! - **Configurable**: Extensive configuration options for different use cases
15//!
16//! ## Quick Start
17//!
//! ```rust,no_run
19//! use tree_parser::{parse_file, ParseOptions, Language};
20//!
21//! #[tokio::main]
22//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
23//!     // Parse a single file
24//!     let parsed_file = parse_file("src/main.rs", ParseOptions::default()).await?;
25//!     
26//!     println!("Found {} constructs", parsed_file.constructs.len());
27//!     for construct in &parsed_file.constructs {
28//!         if let Some(name) = &construct.name {
29//!             println!("{}: {} (lines {}-{})", 
30//!                 construct.node_type, name, 
31//!                 construct.start_line, construct.end_line);
32//!         }
33//!     }
34//!     
35//!     Ok(())
36//! }
37//! ```
38//!
39//! ## Examples
40//!
41//! See the `examples/` directory for comprehensive usage examples including:
42//! - Basic parsing and directory traversal
43//! - Advanced search with regex patterns
44//! - Custom file filtering
45//! - Performance optimization
46//! - Error handling strategies
47
48use std::collections::HashMap;
49use std::path::Path;
50use std::sync::Arc;
51
52use serde::{Deserialize, Serialize};
53use thiserror::Error;
54use tree_sitter::Tree;
55
56// Re-export commonly used types
57pub use tree_sitter::{Point, Range};
58
59// Language modules
60mod languages;
61pub use languages::*;
62
/// Top-level error type returned by the tree parser library.
///
/// Every variant carries an owned payload (a message string or a size) rather
/// than a wrapped source error, which is what lets the enum derive `Clone`,
/// `Serialize` and `Deserialize`. The `Display` text of each variant is taken
/// from its `#[error(...)]` attribute.
#[derive(Error, Debug, Clone, Serialize, Deserialize)]
pub enum Error {
    /// An I/O failure, carried as a message string.
    #[error("IO error: {0}")]
    Io(String),
    /// A parsing failure, carried as a message string.
    #[error("Parse error: {0}")]
    Parse(String),
    /// The language is not supported; the payload is rendered after the prefix.
    #[error("Unsupported language: {0}")]
    UnsupportedLanguage(String),
    /// A file was too large; the payload is rendered as a byte count.
    #[error("File too large: {0} bytes")]
    FileTooLarge(usize),
    /// Access was denied; the payload is rendered after the prefix.
    #[error("Permission denied: {0}")]
    PermissionDenied(String),
    /// A query was rejected as invalid, carried as a message string.
    #[error("Invalid query: {0}")]
    InvalidQuery(String),
}
82
83/// Categorizes different types of errors for easier handling
84/// 
85/// This enum is used to classify errors into broad categories, making it easier
86/// to implement different error handling strategies for different error types.
87#[derive(Debug, Clone, Serialize, Deserialize)]
88pub enum ErrorType {
89    ParseError,
90    IoError,
91    UnsupportedLanguage,
92    FileTooLarge,
93    PermissionDenied,
94}
95
96/// Represents an error that occurred while processing a specific file
97/// 
98/// This struct contains detailed information about parsing failures,
99/// including the file path, error type, and a descriptive message.
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct FileError {
102    pub file_path: String,
103    pub error_type: ErrorType,
104    pub message: String,
105}
106
107/// Supported programming languages
108/// 
109/// This enum represents all programming languages that the tree parser can handle.
110/// Each language corresponds to a specific tree-sitter grammar.
111/// 
112/// # Feature Flags
113/// 
114/// Most languages are gated behind feature flags to reduce compilation time and binary size:
115/// - `python` - Python support
116/// - `rust_lang` - Rust support  
117/// - `javascript` - JavaScript support
118/// - `typescript` - TypeScript support
119/// - `java` - Java support
120/// - `c` - C support
121/// - `cpp` - C++ support
122/// - `go` - Go support
123/// - `full` - All languages
124#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
125pub enum Language {
126    Python,
127    Rust,
128    JavaScript,
129    TypeScript,
130    Java,
131    C,
132    Cpp,
133    Go,
134    CSharp,
135    Php,
136    Ruby,
137    Swift,
138    Kotlin,
139    Scala,
140    Haskell,
141    Lua,
142    Perl,
143    R,
144    Bash,
145    PowerShell,
146    Html,
147    Css,
148    Sql,
149    Json,
150    Yaml,
151    Toml,
152    Xml,
153}
154
155/// Methods for detecting the programming language of a file
156/// 
157/// This enum defines different strategies for automatically determining
158/// the programming language of a source code file.
159#[derive(Debug, Clone, Serialize, Deserialize)]
160pub enum LanguageDetection {
161    ByExtension,
162    ByContent,
163    ByShebang,
164    Combined, // Uses all methods with fallback priority
165}
166
167/// Represents a function or method parameter
168/// 
169/// This struct contains detailed information about a parameter including
170/// its name, type, default value, and whether it's variadic.
171#[derive(Debug, Clone, Serialize, Deserialize)]
172pub struct Parameter {
173    pub name: String,
174    pub param_type: Option<String>,
175    pub default_value: Option<String>,
176    pub is_variadic: bool,
177}
178
179/// Metadata associated with a code construct
180/// 
181/// This struct contains additional information about code constructs such as
182/// visibility modifiers, parameters, return types, inheritance, and documentation.
183#[derive(Debug, Clone, Serialize, Deserialize)]
184pub struct ConstructMetadata {
185    pub visibility: Option<String>,
186    pub modifiers: Vec<String>,
187    pub parameters: Vec<Parameter>,
188    pub return_type: Option<String>,
189    pub inheritance: Vec<String>,
190    pub annotations: Vec<String>,
191    pub documentation: Option<String>,
192}
193
194/// Represents a parsed code construct (function, class, struct, etc.)
195/// 
196/// This is the core data structure that represents any identifiable code element
197/// found during parsing. It includes the construct's location, content, metadata,
198/// and hierarchical relationships with other constructs.
199#[derive(Debug, Clone, Serialize, Deserialize)]
200pub struct CodeConstruct {
201    pub node_type: String,
202    pub name: Option<String>,
203    pub source_code: String,
204    pub start_line: usize,
205    pub end_line: usize,
206    pub start_byte: usize,
207    pub end_byte: usize,
208    pub parent: Option<Box<CodeConstruct>>,
209    pub children: Vec<CodeConstruct>,
210    pub metadata: ConstructMetadata,
211}
212
213/// Represents a successfully parsed source code file
214/// 
215/// This struct contains all information extracted from a single file,
216/// including the parsed constructs, metadata, and performance metrics.
217#[derive(Debug, Clone, Serialize, Deserialize)]
218pub struct ParsedFile {
219    pub file_path: String,
220    pub relative_path: String,
221    pub language: Language,
222    pub constructs: Vec<CodeConstruct>,
223    #[serde(skip)]
224    pub syntax_tree: Option<Tree>,
225    pub file_size_bytes: usize,
226    pub parse_time_ms: u64,
227}
228
229/// Represents the results of parsing an entire project or directory
230/// 
231/// This struct aggregates the results of parsing multiple files,
232/// including success metrics, error information, and language distribution.
233#[derive(Debug, Clone, Serialize, Deserialize)]
234pub struct ParsedProject {
235    pub root_path: String,
236    pub files: Vec<ParsedFile>,
237    pub total_files_processed: usize,
238    pub processing_time_ms: u64,
239    pub language_distribution: HashMap<Language, usize>,
240    pub error_files: Vec<FileError>,
241}
242
243/// Filter criteria for selecting which files to parse
244/// 
245/// This struct allows you to specify various criteria for filtering files
246/// during directory parsing operations. All criteria are optional and are
247/// combined with AND logic when multiple criteria are specified.
248/// 
249/// # Examples
250/// 
251/// ```rust
252/// use tree_parser::{FileFilter, Language};
253/// use std::sync::Arc;
254/// 
255/// // Filter for Rust files only
256/// let filter = FileFilter {
257///     languages: Some(vec![Language::Rust]),
258///     extensions: None,
259///     min_size_bytes: None,
260///     max_size_bytes: None,
261///     custom_predicate: None,
262/// };
263/// 
264/// // Filter with custom logic
265/// let filter = FileFilter {
266///     languages: None,
267///     extensions: Some(vec!["rs".to_string(), "py".to_string()]),
268///     min_size_bytes: Some(100),
269///     max_size_bytes: Some(50_000),
270///     custom_predicate: Some(Arc::new(|path| {
271///         !path.to_string_lossy().contains("test")
272///     })),
273/// };
274/// ```
275#[derive(Clone)]
276pub struct FileFilter {
277    /// File extensions to include (e.g., ["rs", "py"]). None means all supported extensions.
278    pub extensions: Option<Vec<String>>,
279    /// Programming languages to include. None means all supported languages.
280    pub languages: Option<Vec<Language>>,
281    /// Minimum file size in bytes. Files smaller than this are excluded.
282    pub min_size_bytes: Option<usize>,
283    /// Maximum file size in bytes. Files larger than this are excluded.
284    pub max_size_bytes: Option<usize>,
285    /// Custom predicate function for advanced filtering logic
286    pub custom_predicate: Option<Arc<dyn Fn(&Path) -> bool + Send + Sync>>,
287}
288
289/// Configuration options for parsing operations
290/// 
291/// This struct provides extensive configuration options for controlling
292/// how files are parsed, including concurrency settings, file size limits,
293/// and language detection strategies.
294/// 
295/// # Examples
296/// 
297/// ```rust
298/// use tree_parser::{ParseOptions, LanguageDetection};
299/// 
300/// // Use default options
301/// let options = ParseOptions::default();
302/// 
303/// // Custom configuration
304/// let options = ParseOptions {
305///     max_concurrent_files: 8,
306///     include_hidden_files: false,
307///     max_file_size_mb: 5,
308///     recursive: true,
309///     ignore_patterns: vec!["target".to_string(), "node_modules".to_string()],
310///     language_detection: LanguageDetection::Combined,
311///     enable_caching: true,
312///     thread_pool_size: Some(4),
313/// };
314/// ```
315#[derive(Debug, Clone, Serialize, Deserialize)]
316pub struct ParseOptions {
317    /// Maximum number of files to parse concurrently (default: 2 * CPU cores)
318    pub max_concurrent_files: usize,
319    /// Whether to include hidden files (files starting with '.') in parsing
320    pub include_hidden_files: bool,
321    /// Maximum file size in megabytes to parse (larger files are skipped)
322    pub max_file_size_mb: usize,
323    /// Whether to recursively parse subdirectories
324    pub recursive: bool,
325    /// Patterns to ignore during directory traversal (supports glob patterns)
326    pub ignore_patterns: Vec<String>,
327    /// Strategy for detecting the programming language of files
328    pub language_detection: LanguageDetection,
329    /// Whether to enable internal caching for improved performance
330    pub enable_caching: bool,
331    /// Optional thread pool size (None uses system default)
332    pub thread_pool_size: Option<usize>,
333}
334
335impl Default for ParseOptions {
336    fn default() -> Self {
337        Self {
338            max_concurrent_files: num_cpus::get() * 2,
339            include_hidden_files: false,
340            max_file_size_mb: 10,
341            recursive: true,
342            ignore_patterns: vec![
343                "node_modules".to_string(),
344                ".git".to_string(),
345                "target".to_string(),
346                "build".to_string(),
347            ],
348            language_detection: LanguageDetection::ByExtension,
349            enable_caching: true,
350            thread_pool_size: None, // Uses system default
351        }
352    }
353}
354
// Core API functions are implemented in separate modules and re-exported below
356mod parser;
357mod search;
358mod utils;
359
360pub use parser::*;
361pub use search::*;
362pub use utils::*;
363// pub use test_compile::*; // Commented out as not currently used
364
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that well-known file extensions map to the expected language.
    #[test]
    fn test_language_detection() {
        // (file name, language expected from its extension)
        let cases = [
            ("test.py", Language::Python),
            ("test.rs", Language::Rust),
            ("test.js", Language::JavaScript),
        ];

        for (file_name, expected) in cases {
            assert_eq!(detect_language_by_extension(file_name), Some(expected));
        }
    }
}