tree_parser/lib.rs
1//! # Tree Parser Library
2//!
3//! A comprehensive Rust library for parsing and searching code elements across multiple programming languages
4//! using tree-sitter. This library provides powerful tools for static code analysis, code search, and AST manipulation.
5//!
6//! ## Features
7//!
8//! - **Multi-language Support**: Parse Python, Rust, JavaScript, TypeScript, Java, C, C++, Go, and more
9//! - **High Performance**: Concurrent parsing with async/await for maximum efficiency
10//! - **Advanced Search**: Find functions, classes, structs, interfaces with regex pattern matching
11//! - **Flexible Filtering**: Custom file filters and parsing options
12//! - **Rich Metadata**: Extract detailed information about code constructs
13//! - **Type Safety**: Full Rust type safety with comprehensive error handling
14//! - **Configurable**: Extensive configuration options for different use cases
15//!
16//! ## Quick Start
17//!
//! ```rust,no_run
//! use tree_parser::{parse_file, ParseOptions, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     // Parse a single file
//!     let parsed_file = parse_file("src/main.rs", ParseOptions::default()).await?;
//!
//!     println!("Found {} constructs", parsed_file.constructs.len());
//!     for construct in &parsed_file.constructs {
//!         if let Some(name) = &construct.name {
//!             println!("{}: {} (lines {}-{})",
//!                 construct.node_type, name,
//!                 construct.start_line, construct.end_line);
//!         }
//!     }
//!
//!     Ok(())
//! }
//! ```
38//!
39//! ## Examples
40//!
41//! See the `examples/` directory for comprehensive usage examples including:
42//! - Basic parsing and directory traversal
43//! - Advanced search with regex patterns
44//! - Custom file filtering
45//! - Performance optimization
46//! - Error handling strategies
47
48use std::collections::HashMap;
49use std::path::Path;
50use std::sync::Arc;
51
52use serde::{Deserialize, Serialize};
53use thiserror::Error;
54use tree_sitter::Tree;
55
56// Re-export commonly used types
57pub use tree_sitter::{Point, Range};
58
59// Language modules
60mod languages;
61pub use languages::*;
62
/// The library's primary error type.
///
/// Covers the failures that parsing operations can report. Every payload is a
/// plain `String` or `usize`, which is what allows the enum to derive `Clone`
/// and the serde traits. The `#[error(...)]` attribute on each variant defines
/// the text produced by its `Display` implementation.
#[derive(Debug, Clone, Error, Serialize, Deserialize)]
pub enum Error {
    /// An I/O failure; the payload is the text shown after `IO error: `.
    #[error("IO error: {0}")]
    Io(String),

    /// A parsing failure; the payload is the text shown after `Parse error: `.
    #[error("Parse error: {0}")]
    Parse(String),

    /// The language is not supported; the payload is shown after
    /// `Unsupported language: `.
    #[error("Unsupported language: {0}")]
    UnsupportedLanguage(String),

    /// The file is too large; the payload is rendered as a byte count.
    #[error("File too large: {0} bytes")]
    FileTooLarge(usize),

    /// Access was denied; the payload is shown after `Permission denied: `.
    #[error("Permission denied: {0}")]
    PermissionDenied(String),

    /// A query was rejected; the payload is shown after `Invalid query: `.
    #[error("Invalid query: {0}")]
    InvalidQuery(String),
}
82
83/// Categorizes different types of errors for easier handling
84///
85/// This enum is used to classify errors into broad categories, making it easier
86/// to implement different error handling strategies for different error types.
87#[derive(Debug, Clone, Serialize, Deserialize)]
88pub enum ErrorType {
89 ParseError,
90 IoError,
91 UnsupportedLanguage,
92 FileTooLarge,
93 PermissionDenied,
94}
95
96/// Represents an error that occurred while processing a specific file
97///
98/// This struct contains detailed information about parsing failures,
99/// including the file path, error type, and a descriptive message.
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct FileError {
102 pub file_path: String,
103 pub error_type: ErrorType,
104 pub message: String,
105}
106
107/// Supported programming languages
108///
109/// This enum represents all programming languages that the tree parser can handle.
110/// Each language corresponds to a specific tree-sitter grammar.
111///
112/// # Feature Flags
113///
114/// Most languages are gated behind feature flags to reduce compilation time and binary size:
115/// - `python` - Python support
116/// - `rust_lang` - Rust support
117/// - `javascript` - JavaScript support
118/// - `typescript` - TypeScript support
119/// - `java` - Java support
120/// - `c` - C support
121/// - `cpp` - C++ support
122/// - `go` - Go support
123/// - `full` - All languages
124#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
125pub enum Language {
126 Python,
127 Rust,
128 JavaScript,
129 TypeScript,
130 Java,
131 C,
132 Cpp,
133 Go,
134 CSharp,
135 Php,
136 Ruby,
137 Swift,
138 Kotlin,
139 Scala,
140 Haskell,
141 Lua,
142 Perl,
143 R,
144 Bash,
145 PowerShell,
146 Html,
147 Css,
148 Sql,
149 Json,
150 Yaml,
151 Toml,
152 Xml,
153}
154
155/// Methods for detecting the programming language of a file
156///
157/// This enum defines different strategies for automatically determining
158/// the programming language of a source code file.
159#[derive(Debug, Clone, Serialize, Deserialize)]
160pub enum LanguageDetection {
161 ByExtension,
162 ByContent,
163 ByShebang,
164 Combined, // Uses all methods with fallback priority
165}
166
167/// Represents a function or method parameter
168///
169/// This struct contains detailed information about a parameter including
170/// its name, type, default value, and whether it's variadic.
171#[derive(Debug, Clone, Serialize, Deserialize)]
172pub struct Parameter {
173 pub name: String,
174 pub param_type: Option<String>,
175 pub default_value: Option<String>,
176 pub is_variadic: bool,
177}
178
179/// Metadata associated with a code construct
180///
181/// This struct contains additional information about code constructs such as
182/// visibility modifiers, parameters, return types, inheritance, and documentation.
183#[derive(Debug, Clone, Serialize, Deserialize)]
184pub struct ConstructMetadata {
185 pub visibility: Option<String>,
186 pub modifiers: Vec<String>,
187 pub parameters: Vec<Parameter>,
188 pub return_type: Option<String>,
189 pub inheritance: Vec<String>,
190 pub annotations: Vec<String>,
191 pub documentation: Option<String>,
192}
193
194/// Represents a parsed code construct (function, class, struct, etc.)
195///
196/// This is the core data structure that represents any identifiable code element
197/// found during parsing. It includes the construct's location, content, metadata,
198/// and hierarchical relationships with other constructs.
199#[derive(Debug, Clone, Serialize, Deserialize)]
200pub struct CodeConstruct {
201 pub node_type: String,
202 pub name: Option<String>,
203 pub source_code: String,
204 pub start_line: usize,
205 pub end_line: usize,
206 pub start_byte: usize,
207 pub end_byte: usize,
208 pub parent: Option<Box<CodeConstruct>>,
209 pub children: Vec<CodeConstruct>,
210 pub metadata: ConstructMetadata,
211}
212
213/// Represents a successfully parsed source code file
214///
215/// This struct contains all information extracted from a single file,
216/// including the parsed constructs, metadata, and performance metrics.
217#[derive(Debug, Clone, Serialize, Deserialize)]
218pub struct ParsedFile {
219 pub file_path: String,
220 pub relative_path: String,
221 pub language: Language,
222 pub constructs: Vec<CodeConstruct>,
223 #[serde(skip)]
224 pub syntax_tree: Option<Tree>,
225 pub file_size_bytes: usize,
226 pub parse_time_ms: u64,
227}
228
229/// Represents the results of parsing an entire project or directory
230///
231/// This struct aggregates the results of parsing multiple files,
232/// including success metrics, error information, and language distribution.
233#[derive(Debug, Clone, Serialize, Deserialize)]
234pub struct ParsedProject {
235 pub root_path: String,
236 pub files: Vec<ParsedFile>,
237 pub total_files_processed: usize,
238 pub processing_time_ms: u64,
239 pub language_distribution: HashMap<Language, usize>,
240 pub error_files: Vec<FileError>,
241}
242
243/// Filter criteria for selecting which files to parse
244///
245/// This struct allows you to specify various criteria for filtering files
246/// during directory parsing operations. All criteria are optional and are
247/// combined with AND logic when multiple criteria are specified.
248///
249/// # Examples
250///
251/// ```rust
252/// use tree_parser::{FileFilter, Language};
253/// use std::sync::Arc;
254///
255/// // Filter for Rust files only
256/// let filter = FileFilter {
257/// languages: Some(vec![Language::Rust]),
258/// extensions: None,
259/// min_size_bytes: None,
260/// max_size_bytes: None,
261/// custom_predicate: None,
262/// };
263///
264/// // Filter with custom logic
265/// let filter = FileFilter {
266/// languages: None,
267/// extensions: Some(vec!["rs".to_string(), "py".to_string()]),
268/// min_size_bytes: Some(100),
269/// max_size_bytes: Some(50_000),
270/// custom_predicate: Some(Arc::new(|path| {
271/// !path.to_string_lossy().contains("test")
272/// })),
273/// };
274/// ```
275#[derive(Clone)]
276pub struct FileFilter {
277 /// File extensions to include (e.g., ["rs", "py"]). None means all supported extensions.
278 pub extensions: Option<Vec<String>>,
279 /// Programming languages to include. None means all supported languages.
280 pub languages: Option<Vec<Language>>,
281 /// Minimum file size in bytes. Files smaller than this are excluded.
282 pub min_size_bytes: Option<usize>,
283 /// Maximum file size in bytes. Files larger than this are excluded.
284 pub max_size_bytes: Option<usize>,
285 /// Custom predicate function for advanced filtering logic
286 pub custom_predicate: Option<Arc<dyn Fn(&Path) -> bool + Send + Sync>>,
287}
288
289/// Configuration options for parsing operations
290///
291/// This struct provides extensive configuration options for controlling
292/// how files are parsed, including concurrency settings, file size limits,
293/// and language detection strategies.
294///
295/// # Examples
296///
297/// ```rust
298/// use tree_parser::{ParseOptions, LanguageDetection};
299///
300/// // Use default options
301/// let options = ParseOptions::default();
302///
303/// // Custom configuration
304/// let options = ParseOptions {
305/// max_concurrent_files: 8,
306/// include_hidden_files: false,
307/// max_file_size_mb: 5,
308/// recursive: true,
309/// ignore_patterns: vec!["target".to_string(), "node_modules".to_string()],
310/// language_detection: LanguageDetection::Combined,
311/// enable_caching: true,
312/// thread_pool_size: Some(4),
313/// };
314/// ```
315#[derive(Debug, Clone, Serialize, Deserialize)]
316pub struct ParseOptions {
317 /// Maximum number of files to parse concurrently (default: 2 * CPU cores)
318 pub max_concurrent_files: usize,
319 /// Whether to include hidden files (files starting with '.') in parsing
320 pub include_hidden_files: bool,
321 /// Maximum file size in megabytes to parse (larger files are skipped)
322 pub max_file_size_mb: usize,
323 /// Whether to recursively parse subdirectories
324 pub recursive: bool,
325 /// Patterns to ignore during directory traversal (supports glob patterns)
326 pub ignore_patterns: Vec<String>,
327 /// Strategy for detecting the programming language of files
328 pub language_detection: LanguageDetection,
329 /// Whether to enable internal caching for improved performance
330 pub enable_caching: bool,
331 /// Optional thread pool size (None uses system default)
332 pub thread_pool_size: Option<usize>,
333}
334
335impl Default for ParseOptions {
336 fn default() -> Self {
337 Self {
338 max_concurrent_files: num_cpus::get() * 2,
339 include_hidden_files: false,
340 max_file_size_mb: 10,
341 recursive: true,
342 ignore_patterns: vec![
343 "node_modules".to_string(),
344 ".git".to_string(),
345 "target".to_string(),
346 "build".to_string(),
347 ],
348 language_detection: LanguageDetection::ByExtension,
349 enable_caching: true,
350 thread_pool_size: None, // Uses system default
351 }
352 }
353}
354
// Core API functions are implemented in separate modules and re-exported below
356mod parser;
357mod search;
358mod utils;
359
360pub use parser::*;
361pub use search::*;
362pub use utils::*;
363// pub use test_compile::*; // Commented out as not currently used
364
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `detect_language_by_extension` maps a few well-known file
    /// names to the expected language.
    #[test]
    fn test_language_detection() {
        // (file name, expected language) pairs, checked in order.
        let expectations = [
            ("test.py", Language::Python),
            ("test.rs", Language::Rust),
            ("test.js", Language::JavaScript),
        ];

        for (file_name, expected) in expectations {
            assert_eq!(detect_language_by_extension(file_name), Some(expected));
        }
    }
}
375}