tree_parser/lib.rs
1//! # Tree Parser Library
2//!
3//! A comprehensive Rust library for parsing and searching code elements across multiple programming languages
4//! using tree-sitter. This library provides powerful tools for static code analysis, code search, and AST manipulation.
5//!
6//! ## Features
7//!
8//! - **Multi-language Support**: Parse Python, Rust, JavaScript, TypeScript, Java, C, C++, Go, and more
9//! - **High Performance**: Concurrent parsing with async/await for maximum efficiency
10//! - **Advanced Search**: Find functions, classes, structs, interfaces with regex pattern matching
11//! - **Flexible Filtering**: Custom file filters and parsing options
12//! - **Rich Metadata**: Extract detailed information about code constructs
13//! - **Type Safety**: Full Rust type safety with comprehensive error handling
14//! - **Configurable**: Extensive configuration options for different use cases
15//!
16//! ## Quick Start
17//!
//! ```rust,no_run
//! use tree_parser::{parse_file, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     // Parse a single file
//!     let parsed_file = parse_file("src/main.rs", Language::Rust).await?;
//!
//!     println!("Found {} constructs", parsed_file.constructs.len());
//!     for construct in &parsed_file.constructs {
//!         if let Some(name) = &construct.name {
//!             println!("{}: {} (lines {}-{})",
//!                 construct.node_type, name,
//!                 construct.start_line, construct.end_line);
//!         }
//!     }
//!
//!     Ok(())
//! }
//! ```
38//!
39//! ## Finding Code Constructs
40//!
41//! This library provides several powerful methods to search for specific code constructs:
42//!
43//! ### 1. Search by Node Type
44//!
//! ```rust,no_run
//! use tree_parser::{parse_file, search_by_node_type, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let parsed_file = parse_file("example.py", Language::Python).await?;
//!
//!     // Find all function definitions
//!     let functions = search_by_node_type(&parsed_file, "function_definition", None);
//!
//!     // Find test functions using regex
//!     let test_functions = search_by_node_type(&parsed_file, "function_definition", Some(r"^test_.*"));
//!
//!     println!("Found {} functions, {} are tests", functions.len(), test_functions.len());
//!     Ok(())
//! }
//! ```
62//!
63//! ### 2. Search by Multiple Node Types
64//!
//! ```rust,no_run
//! use tree_parser::{parse_file, search_by_multiple_node_types, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let parsed_file = parse_file("example.js", Language::JavaScript).await?;
//!
//!     // Find all function-like constructs
//!     let functions = search_by_multiple_node_types(
//!         &parsed_file,
//!         &["function_declaration", "function_expression", "arrow_function"],
//!         None
//!     );
//!
//!     println!("Found {} function-like constructs", functions.len());
//!     Ok(())
//! }
//! ```
83//!
84//! ### 3. Advanced Search with Tree-sitter Queries
85//!
//! ```rust,no_run
//! use tree_parser::{parse_file, search_by_query, Language};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let parsed_file = parse_file("example.py", Language::Python).await?;
//!
//!     // Find all class definitions with their methods
//!     let query = r#"
//!         (class_definition
//!             name: (identifier) @class_name
//!             body: (block
//!                 (function_definition
//!                     name: (identifier) @method_name)))
//!     "#;
//!
//!     let classes_with_methods = search_by_query(&parsed_file, query)?;
//!     println!("Found {} classes with methods", classes_with_methods.len());
//!     Ok(())
//! }
//! ```
107//!
108//! ## Finding Node Types
109//!
110//! To effectively search for code constructs, you need to know the correct node types.
111//! Here are the most common node types by language:
112//!
113//! ### Python
114//! - `function_definition` - Function definitions
115//! - `class_definition` - Class definitions
116//! - `import_statement` - Import statements
117//! - `decorated_definition` - Functions/classes with decorators
118//! - `assignment` - Variable assignments
119//!
120//! ### Rust
121//! - `function_item` - Function definitions
122//! - `struct_item` - Struct definitions
123//! - `impl_item` - Implementation blocks
124//! - `trait_item` - Trait definitions
125//! - `enum_item` - Enum definitions
126//! - `mod_item` - Module definitions
127//!
128//! ### JavaScript/TypeScript
129//! - `function_declaration` - Function declarations
130//! - `function_expression` - Function expressions
131//! - `arrow_function` - Arrow functions
132//! - `method_definition` - Class methods
133//! - `class_declaration` - Class declarations
134//!
135//! ### Java
136//! - `method_declaration` - Method definitions
137//! - `class_declaration` - Class declarations
138//! - `interface_declaration` - Interface declarations
139//! - `constructor_declaration` - Constructor definitions
140//!
141//! For a complete list of node types, inspect your parsed files or consult the
142//! tree-sitter grammar documentation for your target language.
143//!
144//! ### Discovering Node Types
145//!
//! ```rust,no_run
//! use tree_parser::{parse_file, Language};
//! use std::collections::HashSet;
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let parsed_file = parse_file("your_file.py", Language::Python).await?;
//!
//!     // Collect all unique node types
//!     let mut node_types: HashSet<String> = HashSet::new();
//!     for construct in &parsed_file.constructs {
//!         node_types.insert(construct.node_type.clone());
//!     }
//!
//!     println!("Available node types:");
//!     for node_type in &node_types {
//!         println!(" - {}", node_type);
//!     }
//!
//!     Ok(())
//! }
//! ```
168//!
//! ### Online Tree-sitter Playground
170//!
171//! Use the [Tree-sitter Playground](https://tree-sitter.github.io/tree-sitter/playground) to:
172//! 1. Paste your code
173//! 2. Select the appropriate language
174//! 3. Explore the generated syntax tree
175//! 4. Identify the exact node types you need
176//!
177//! ## Best Practices
178//!
179//! ### Performance Optimization
180//! - Increase `max_concurrent_files` for better performance on multi-core systems
181//! - Use file filters to exclude unnecessary files (node_modules, target, .git, etc.)
182//! - Set appropriate `max_file_size_mb` limits to skip very large files
183//! - Enable caching with `enable_caching: true` for repeated operations
184//! - Use `LanguageDetection::ByExtension` for faster processing
185//!
186//! ### Memory Management
187//! - Set `syntax_tree: None` after extracting constructs if you don't need the tree
188//! - Process files in batches rather than loading entire projects
189//! - Use streaming approaches for very large codebases
190//!
191//! ### Error Handling
192//! - Always check `project.error_files` for individual file parsing errors
193//! - Handle different `ErrorType` variants appropriately
194//! - Use proper error propagation with `?` operator
195//!
196//! ## Troubleshooting
197//!
198//! **Common Issues:**
199//! - "Unsupported language" error: Enable correct feature flags in Cargo.toml
200//! - "Parse error" for valid code: Check for syntax errors or unsupported language features
201//! - Poor performance: Increase concurrency, use filters, enable caching
202//! - Memory issues: Drop syntax trees after use, process in batches
203//! - Missing constructs: Verify node type names, check nesting, use tree-sitter queries
204
205use std::collections::HashMap;
206use std::path::Path;
207use std::sync::Arc;
208
209use serde::{Deserialize, Serialize};
210use thiserror::Error;
211use tree_sitter::Tree;
212
213// Re-export commonly used types
214pub use tree_sitter::{Point, Range};
215
216// Language modules
217mod languages;
218pub use languages::*;
219
/// Error type returned by the tree parser library.
///
/// Every variant carries a payload that is interpolated into its display
/// message: a descriptive string, or a byte count for
/// [`Error::FileTooLarge`]. The type is cloneable and serializable.
#[derive(Error, Debug, Clone, Serialize, Deserialize)]
pub enum Error {
    /// An I/O failure; the payload is the error description.
    #[error("IO error: {0}")]
    Io(String),

    /// A parsing failure; the payload is the error description.
    #[error("Parse error: {0}")]
    Parse(String),

    /// The requested language is not supported; the payload names it.
    #[error("Unsupported language: {0}")]
    UnsupportedLanguage(String),

    /// A file was too large; the payload is a size in bytes.
    #[error("File too large: {0} bytes")]
    FileTooLarge(usize),

    /// Access was denied; the payload is the error description.
    #[error("Permission denied: {0}")]
    PermissionDenied(String),

    /// A query was rejected as invalid; the payload is the error description.
    #[error("Invalid query: {0}")]
    InvalidQuery(String),
}
239
240/// Categorizes different types of errors for easier handling
241///
242/// This enum is used to classify errors into broad categories, making it easier
243/// to implement different error handling strategies for different error types.
244#[derive(Debug, Clone, Serialize, Deserialize)]
245pub enum ErrorType {
246 ParseError,
247 IoError,
248 UnsupportedLanguage,
249 FileTooLarge,
250 PermissionDenied,
251}
252
253/// Represents an error that occurred while processing a specific file
254///
255/// This struct contains detailed information about parsing failures,
256/// including the file path, error type, and a descriptive message.
257#[derive(Debug, Clone, Serialize, Deserialize)]
258pub struct FileError {
259 pub file_path: String,
260 pub error_type: ErrorType,
261 pub message: String,
262}
263
264/// Supported programming languages
265///
266/// This enum represents all programming languages that the tree parser can handle.
267/// Each language corresponds to a specific tree-sitter grammar.
268///
269/// # Feature Flags
270///
271/// Most languages are gated behind feature flags to reduce compilation time and binary size:
272/// - `python` - Python support
273/// - `rust_lang` - Rust support
274/// - `javascript` - JavaScript support
275/// - `typescript` - TypeScript support
276/// - `java` - Java support
277/// - `c` - C support
278/// - `cpp` - C++ support
279/// - `go` - Go support
280/// - `full` - All languages
281#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
282pub enum Language {
283 Python,
284 Rust,
285 JavaScript,
286 TypeScript,
287 Java,
288 C,
289 Cpp,
290 Go,
291 CSharp,
292 Php,
293 Ruby,
294 Swift,
295 Kotlin,
296 Scala,
297 Haskell,
298 Lua,
299 Perl,
300 R,
301 Bash,
302 PowerShell,
303 Html,
304 Css,
305 Sql,
306 Json,
307 Yaml,
308 Toml,
309 Xml,
310}
311
312/// Methods for detecting the programming language of a file
313///
314/// This enum defines different strategies for automatically determining
315/// the programming language of a source code file.
316#[derive(Debug, Clone, Serialize, Deserialize)]
317pub enum LanguageDetection {
318 ByExtension,
319 ByContent,
320 ByShebang,
321 Combined, // Uses all methods with fallback priority
322}
323
324/// Represents a function or method parameter
325///
326/// This struct contains detailed information about a parameter including
327/// its name, type, default value, and whether it's variadic.
328#[derive(Debug, Clone, Serialize, Deserialize)]
329pub struct Parameter {
330 pub name: String,
331 pub param_type: Option<String>,
332 pub default_value: Option<String>,
333 pub is_variadic: bool,
334}
335
336/// Metadata associated with a code construct
337///
338/// This struct contains additional information about code constructs such as
339/// visibility modifiers, parameters, return types, inheritance, and documentation.
340#[derive(Debug, Clone, Serialize, Deserialize)]
341pub struct ConstructMetadata {
342 pub visibility: Option<String>,
343 pub modifiers: Vec<String>,
344 pub parameters: Vec<Parameter>,
345 pub return_type: Option<String>,
346 pub inheritance: Vec<String>,
347 pub annotations: Vec<String>,
348 pub documentation: Option<String>,
349}
350
351/// Represents a parsed code construct (function, class, struct, etc.)
352///
353/// This is the core data structure that represents any identifiable code element
354/// found during parsing. It includes the construct's location, content, metadata,
355/// and hierarchical relationships with other constructs.
356#[derive(Debug, Clone, Serialize, Deserialize)]
357pub struct CodeConstruct {
358 pub node_type: String,
359 pub name: Option<String>,
360 pub source_code: String,
361 pub start_line: usize,
362 pub end_line: usize,
363 pub start_byte: usize,
364 pub end_byte: usize,
365 pub parent: Option<Box<CodeConstruct>>,
366 pub children: Vec<CodeConstruct>,
367 pub metadata: ConstructMetadata,
368}
369
370/// Represents a successfully parsed source code file
371///
372/// This struct contains all information extracted from a single file,
373/// including the parsed constructs, metadata, and performance metrics.
374#[derive(Debug, Clone, Serialize, Deserialize)]
375pub struct ParsedFile {
376 pub file_path: String,
377 pub relative_path: String,
378 pub language: Language,
379 pub constructs: Vec<CodeConstruct>,
380 #[serde(skip)]
381 pub syntax_tree: Option<Tree>,
382 pub file_size_bytes: usize,
383}
384
385/// Represents the results of parsing an entire project or directory
386///
387/// This struct aggregates the results of parsing multiple files,
388/// including success metrics, error information, and language distribution.
389#[derive(Debug, Clone, Serialize, Deserialize)]
390pub struct ParsedProject {
391 pub root_path: String,
392 pub files: Vec<ParsedFile>,
393 pub total_files_processed: usize,
394 pub language_distribution: HashMap<Language, usize>,
395 pub error_files: Vec<FileError>,
396}
397
398/// Filter criteria for selecting which files to parse
399///
400/// This struct allows you to specify various criteria for filtering files
401/// during directory parsing operations. All criteria are optional and are
402/// combined with AND logic when multiple criteria are specified.
403///
404/// # Examples
405///
406/// ```rust
407/// use tree_parser::{FileFilter, Language};
408/// use std::sync::Arc;
409///
410/// // Filter for Rust files only
411/// let filter = FileFilter {
412/// languages: Some(vec![Language::Rust]),
413/// extensions: None,
414/// min_size_bytes: None,
415/// max_size_bytes: None,
416/// custom_predicate: None,
417/// };
418///
419/// // Filter with custom logic
420/// let filter = FileFilter {
421/// languages: None,
422/// extensions: Some(vec!["rs".to_string(), "py".to_string()]),
423/// min_size_bytes: Some(100),
424/// max_size_bytes: Some(50_000),
425/// custom_predicate: Some(Arc::new(|path| {
426/// !path.to_string_lossy().contains("test")
427/// })),
428/// };
429/// ```
430#[derive(Clone)]
431pub struct FileFilter {
432 /// File extensions to include (e.g., ["rs", "py"]). None means all supported extensions.
433 pub extensions: Option<Vec<String>>,
434 /// Programming languages to include. None means all supported languages.
435 pub languages: Option<Vec<Language>>,
436 /// Minimum file size in bytes. Files smaller than this are excluded.
437 pub min_size_bytes: Option<usize>,
438 /// Maximum file size in bytes. Files larger than this are excluded.
439 pub max_size_bytes: Option<usize>,
440 /// Custom predicate function for advanced filtering logic
441 pub custom_predicate: Option<Arc<dyn Fn(&Path) -> bool + Send + Sync>>,
442}
443
444/// Configuration options for parsing operations
445///
446/// This struct provides extensive configuration options for controlling
447/// how files are parsed, including concurrency settings, file size limits,
448/// and language detection strategies.
449///
450/// # Examples
451///
452/// ```rust
453/// use tree_parser::{ParseOptions, LanguageDetection};
454///
455/// // Use default options
456/// let options = ParseOptions::default();
457///
458/// // Custom configuration
459/// let options = ParseOptions {
460/// max_concurrent_files: 8,
461/// include_hidden_files: false,
462/// max_file_size_mb: 5,
463/// recursive: true,
464/// ignore_patterns: vec!["target".to_string(), "node_modules".to_string()],
465/// language_detection: LanguageDetection::Combined,
466/// enable_caching: true,
467/// thread_pool_size: Some(4),
468/// };
469/// ```
470#[derive(Debug, Clone, Serialize, Deserialize)]
471pub struct ParseOptions {
472 /// Maximum number of files to parse concurrently (default: 2 * CPU cores)
473 pub max_concurrent_files: usize,
474 /// Whether to include hidden files (files starting with '.') in parsing
475 pub include_hidden_files: bool,
476 /// Maximum file size in megabytes to parse (larger files are skipped)
477 pub max_file_size_mb: usize,
478 /// Whether to recursively parse subdirectories
479 pub recursive: bool,
480 /// Patterns to ignore during directory traversal (supports glob patterns)
481 pub ignore_patterns: Vec<String>,
482 /// Strategy for detecting the programming language of files
483 pub language_detection: LanguageDetection,
484 /// Whether to enable internal caching for improved performance
485 pub enable_caching: bool,
486 /// Optional thread pool size (None uses system default)
487 pub thread_pool_size: Option<usize>,
488}
489
490impl Default for ParseOptions {
491 fn default() -> Self {
492 Self {
493 max_concurrent_files: num_cpus::get() * 2,
494 include_hidden_files: false,
495 max_file_size_mb: 10,
496 recursive: true,
497 ignore_patterns: vec![
498 "node_modules".to_string(),
499 ".git".to_string(),
500 "target".to_string(),
501 "build".to_string(),
502 ],
503 language_detection: LanguageDetection::ByExtension,
504 enable_caching: true,
505 thread_pool_size: None, // Uses system default
506 }
507 }
508}
509
// Core API functions are implemented in separate modules and re-exported below
511mod parser;
512mod search;
513mod utils;
514
515pub use parser::*;
516pub use search::*;
517pub use utils::*;
518// pub use test_compile::*; // Commented out as not currently used
519
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that extension-based detection maps sample file names to the
    /// expected languages.
    #[test]
    fn test_language_detection() {
        let expectations = [
            ("test.py", Language::Python),
            ("test.rs", Language::Rust),
            ("test.js", Language::JavaScript),
        ];

        for (file_name, language) in expectations {
            assert_eq!(detect_language_by_extension(file_name), Some(language));
        }
    }
}
530}