use crate::{
languages::*, CodeConstruct, ConstructMetadata, Error, ErrorType, FileError, Language,
LanguageDetection, ParseOptions, ParsedFile, ParsedProject,
};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use tokio::fs;
use tree_sitter::{Node, Parser, Tree};
use walkdir::WalkDir;
pub async fn parse_file(file_path: &str, language: Language) -> Result<ParsedFile, Error> {
let content = fs::read_to_string(file_path)
.await
.map_err(|e| Error::Io(e.to_string()))?;
let file_size_bytes = content.len();
let ts_language = get_tree_sitter_language(&language)?;
let mut parser = Parser::new();
parser
.set_language(&ts_language)
.map_err(|e| Error::Parse(e.to_string()))?;
let tree = parser
.parse(&content, None)
.ok_or_else(|| Error::Parse("Failed to parse file".to_string()))?;
let constructs = extract_constructs(&tree, &content, &language);
let path = Path::new(file_path);
let relative_path = path
.file_name()
.unwrap_or_default()
.to_string_lossy()
.to_string();
Ok(ParsedFile {
file_path: file_path.to_string(),
relative_path,
language,
constructs,
syntax_tree: Some(tree),
file_size_bytes,
})
}
pub async fn parse_directory(
dir_path: &str,
options: ParseOptions,
) -> Result<ParsedProject, Error> {
let root_path = PathBuf::from(dir_path);
if !root_path.exists() {
return Err(Error::Io(format!("Directory does not exist: {}", dir_path)));
}
let files_to_parse = collect_files(&root_path, &options)?;
let (parsed_files, error_files) = parse_files_parallel(files_to_parse, &options).await;
let total_files_processed = parsed_files.len();
let mut language_distribution = HashMap::new();
for file in &parsed_files {
*language_distribution.entry(file.language.clone()).or_insert(0) += 1;
}
Ok(ParsedProject {
root_path: dir_path.to_string(),
files: parsed_files,
total_files_processed,
language_distribution,
error_files,
})
}
pub async fn parse_directory_with_filter(
dir_path: &str,
file_filter: &crate::FileFilter,
options: ParseOptions,
) -> Result<ParsedProject, Error> {
let root_path = PathBuf::from(dir_path);
if !root_path.exists() {
return Err(Error::Io(format!("Directory does not exist: {}", dir_path)));
}
let files_to_parse = collect_files_with_filter(&root_path, &options, file_filter)?;
let (parsed_files, error_files) = parse_files_parallel(files_to_parse, &options).await;
let total_files_processed = parsed_files.len();
let mut language_distribution = HashMap::new();
for file in &parsed_files {
*language_distribution.entry(file.language.clone()).or_insert(0) += 1;
}
Ok(ParsedProject {
root_path: dir_path.to_string(),
files: parsed_files,
total_files_processed,
language_distribution,
error_files,
})
}
/// Walks `root_path` and returns the files that should be handed to the parser.
///
/// The walk is limited to depth 1 unless `options.recursive` is set. An entry is kept
/// when all of the following hold:
///
/// * it is not a directory;
/// * it is not a hidden file, unless `options.include_hidden_files` is set;
/// * its path matches none of `options.ignore_patterns`;
/// * its size does not exceed `options.max_file_size_mb` megabytes (entries whose
///   metadata cannot be read skip this check);
/// * `detect_language_by_extension` recognises its path.
///
/// # Errors
///
/// Returns `Error::Io` when the directory walk yields an error.
fn collect_files(root_path: &Path, options: &ParseOptions) -> Result<Vec<PathBuf>, Error> {
    const BYTES_PER_MB: u64 = 1024 * 1024;
    // Compare in bytes rather than in whole megabytes: integer-dividing the file size
    // would let a file that is a fraction of a megabyte over the limit slip through.
    let max_file_size_bytes = (options.max_file_size_mb as u64).saturating_mul(BYTES_PER_MB);

    let walker = if options.recursive {
        WalkDir::new(root_path)
    } else {
        WalkDir::new(root_path).max_depth(1)
    };

    let mut files = Vec::new();
    for entry in walker {
        let entry = entry.map_err(|e| Error::Io(e.to_string()))?;
        let path = entry.path();
        if path.is_dir() {
            continue;
        }
        if !options.include_hidden_files && is_hidden_file(path) {
            continue;
        }
        if should_ignore_file(path, &options.ignore_patterns) {
            continue;
        }
        // Best effort: a file whose metadata is unreadable is not size-checked.
        if let Ok(metadata) = path.metadata() {
            if metadata.len() > max_file_size_bytes {
                continue;
            }
        }
        if detect_language_by_extension(&path.to_string_lossy()).is_some() {
            files.push(path.to_path_buf());
        }
    }
    Ok(files)
}
/// Collects files like `collect_files` and then keeps only those accepted by `filter`.
///
/// Each criterion of the filter is optional and is only applied when it is set:
///
/// * `extensions` — the lower-cased file extension must be in the list; files without
///   an extension are rejected.
/// * `languages` — the language detected from the path must be in the list; files with
///   no detectable language are rejected.
/// * `min_size_bytes` / `max_size_bytes` — inclusive size bounds, skipped when the file
///   metadata cannot be read.
/// * `custom_predicate` — must return `true` for the path.
///
/// # Errors
///
/// Propagates any error returned by `collect_files`.
fn collect_files_with_filter(
    root_path: &Path,
    options: &ParseOptions,
    filter: &crate::FileFilter,
) -> Result<Vec<PathBuf>, Error> {
    let candidates = collect_files(root_path, options)?;

    let accepted = candidates
        .into_iter()
        .filter(|path| {
            if let Some(ref extensions) = filter.extensions {
                let extension_allowed = match path.extension() {
                    Some(ext) => extensions.contains(&ext.to_string_lossy().to_lowercase()),
                    None => false,
                };
                if !extension_allowed {
                    return false;
                }
            }

            if let Some(ref languages) = filter.languages {
                let language_allowed =
                    match detect_language_by_extension(&path.to_string_lossy()) {
                        Some(detected_lang) => languages.contains(&detected_lang),
                        None => false,
                    };
                if !language_allowed {
                    return false;
                }
            }

            // Size bounds are only enforced when the metadata is readable.
            if let Ok(metadata) = path.metadata() {
                let size = metadata.len() as usize;
                let too_small = matches!(filter.min_size_bytes, Some(min_size) if size < min_size);
                let too_large = matches!(filter.max_size_bytes, Some(max_size) if size > max_size);
                if too_small || too_large {
                    return false;
                }
            }

            match filter.custom_predicate {
                Some(ref predicate) => predicate(path),
                None => true,
            }
        })
        .collect();

    Ok(accepted)
}
async fn parse_files_parallel(
files: Vec<PathBuf>,
options: &ParseOptions,
) -> (Vec<ParsedFile>, Vec<FileError>) {
let chunk_size = std::cmp::max(1, files.len() / options.max_concurrent_files);
let mut parsed_files = Vec::new();
let mut error_files = Vec::new();
for chunk in files.chunks(chunk_size) {
let chunk_results: Vec<_> = chunk
.iter()
.map(|path| async move {
let path_str = path.to_string_lossy().to_string();
let language = match options.language_detection {
LanguageDetection::ByExtension => detect_language_by_extension(&path_str),
LanguageDetection::Combined => {
if let Ok(content) = tokio::fs::read_to_string(path).await {
detect_language(&path_str, Some(&content))
} else {
detect_language_by_extension(&path_str)
}
}
_ => detect_language_by_extension(&path_str), };
if let Some(lang) = language {
match parse_file(&path_str, lang).await {
Ok(parsed) => Ok(parsed),
Err(e) => Err(FileError {
file_path: path_str,
error_type: ErrorType::ParseError,
message: e.to_string(),
}),
}
} else {
Err(FileError {
file_path: path_str,
error_type: ErrorType::UnsupportedLanguage,
message: "Could not detect language".to_string(),
})
}
})
.collect();
for result in futures::future::join_all(chunk_results).await {
match result {
Ok(parsed_file) => parsed_files.push(parsed_file),
Err(error) => error_files.push(error),
}
}
}
(parsed_files, error_files)
}
/// Extracts all supported constructs from `tree` as a flat, pre-order list.
///
/// The constructs are first gathered hierarchically starting at the root node and then
/// flattened, so every construct appears in the result together with its own
/// `children` populated.
fn extract_constructs(tree: &Tree, source: &str, language: &Language) -> Vec<CodeConstruct> {
    let mut hierarchy = Vec::new();
    extract_constructs_hierarchical(tree.root_node(), source, language, &mut hierarchy, None);

    let mut flat = Vec::new();
    flatten_constructs(&hierarchy, &mut flat);
    flat
}
/// Recursively walks the subtree rooted at `node` and appends the constructs it finds
/// to `constructs`.
///
/// When the node's kind is one of the supported types for `language`, a construct is
/// created for it (with `parent_construct` as its parent), its descendants are collected
/// as its `children`, and the construct is pushed onto `constructs`. Otherwise the node
/// is transparent: its children are visited with the same output list and parent.
fn extract_constructs_hierarchical(
    node: Node,
    source: &str,
    language: &Language,
    constructs: &mut Vec<CodeConstruct>,
    parent_construct: Option<&CodeConstruct>,
) {
    let kind = node.kind().to_string();
    let is_construct = get_supported_node_types(language).contains(&kind);
    let child_nodes = (0..node.child_count()).filter_map(|idx| node.child(idx));

    if !is_construct {
        for child in child_nodes {
            extract_constructs_hierarchical(child, source, language, constructs, parent_construct);
        }
        return;
    }

    let mut construct = create_code_construct_with_parent(node, source, language, parent_construct);
    // Children are gathered into a separate list first because `construct` is borrowed
    // as their parent while they are being built.
    let mut nested = Vec::new();
    for child in child_nodes {
        extract_constructs_hierarchical(child, source, language, &mut nested, Some(&construct));
    }
    construct.children = nested;
    constructs.push(construct);
}
/// Appends a clone of every construct in `constructs`, followed by its descendants, to
/// `flattened` (pre-order traversal of the `children` hierarchy).
fn flatten_constructs(constructs: &[CodeConstruct], flattened: &mut Vec<CodeConstruct>) {
    constructs.iter().for_each(|item| {
        flattened.push(item.clone());
        flatten_constructs(&item.children, flattened);
    });
}
/// Builds a [`CodeConstruct`] describing `node`.
///
/// The construct records the node kind, the source text covered by the node's byte
/// range, 1-based start/end lines, the byte offsets, a boxed clone of
/// `parent_construct` (if any), and the name and metadata extracted from the node.
/// `children` starts out empty; the caller fills it in.
fn create_code_construct_with_parent(
    node: Node,
    source: &str,
    language: &Language,
    parent_construct: Option<&CodeConstruct>,
) -> CodeConstruct {
    let (start_byte, end_byte) = (node.start_byte(), node.end_byte());

    CodeConstruct {
        node_type: node.kind().to_string(),
        name: extract_construct_name(node, source),
        source_code: source[start_byte..end_byte].to_string(),
        // Node rows are shifted by one so the stored line numbers start at 1.
        start_line: node.start_position().row + 1,
        end_line: node.end_position().row + 1,
        start_byte,
        end_byte,
        parent: parent_construct.cloned().map(Box::new),
        children: Vec::new(),
        metadata: extract_metadata(node, source, language),
    }
}
fn extract_construct_name(node: Node, source: &str) -> Option<String> {
for i in 0..node.child_count() {
if let Some(child) = node.child(i) {
if child.kind() == "identifier" || child.kind() == "name" {
let start = child.start_byte();
let end = child.end_byte();
return Some(source[start..end].to_string());
}
}
}
None
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Language;

    /// Returns the first construct in `all` whose node type equals `kind`.
    fn first_of_kind<'a>(all: &'a [CodeConstruct], kind: &str) -> Option<&'a CodeConstruct> {
        all.iter().find(|c| c.node_type == kind)
    }

    /// A method nested in a class must point at the class as its parent, and the class
    /// must list the method among its children.
    #[test]
    fn test_parent_child_relationships() {
        let source = "class TestClass:\n def test_method(self):\n pass";
        let python = Language::Python;

        let grammar = crate::languages::get_tree_sitter_language(&python).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&grammar).unwrap();
        let tree = parser.parse(source, None).unwrap();

        let constructs = extract_constructs(&tree, source, &python);
        let class_construct = first_of_kind(&constructs, "class_definition");
        let method_construct = first_of_kind(&constructs, "function_definition");
        assert!(class_construct.is_some(), "Should find class construct");
        assert!(method_construct.is_some(), "Should find method construct");

        // Child -> parent direction.
        let method = method_construct.unwrap();
        assert!(method.parent.is_some(), "Method should have a parent");
        if let Some(parent) = method.parent.as_ref() {
            assert_eq!(parent.node_type, "class_definition", "Method's parent should be the class");
        }

        // Parent -> child direction.
        let class = class_construct.unwrap();
        assert!(!class.children.is_empty(), "Class should have children");
        let child_method = first_of_kind(&class.children, "function_definition");
        assert!(child_method.is_some(), "Class should contain the method as a child");
    }
}
/// Produces the metadata for a construct.
///
/// This is currently a placeholder: all three arguments are ignored and every field of
/// the returned [`ConstructMetadata`] is empty (`None` or an empty list).
fn extract_metadata(_node: Node, _source: &str, _language: &Language) -> ConstructMetadata {
    ConstructMetadata {
        // Optional scalar fields.
        visibility: None,
        return_type: None,
        documentation: None,
        // List fields.
        modifiers: vec![],
        parameters: vec![],
        inheritance: vec![],
        annotations: vec![],
    }
}
/// Reports whether the final component of `path` starts with a dot.
///
/// Only the file name is inspected; hidden parent directories do not make a file
/// hidden. Paths without a final component (for example `..` or an empty path) are
/// never hidden.
fn is_hidden_file(path: &Path) -> bool {
    path.file_name()
        // A lossy conversion keeps the leading `.` intact, so dot-files whose names are
        // not valid UTF-8 are still recognised as hidden.
        .map(|name| name.to_string_lossy().starts_with('.'))
        .unwrap_or(false)
}
/// Reports whether `path` should be skipped because it matches an ignore pattern.
///
/// A pattern matches when it occurs as a plain substring of the (lossily converted)
/// path. Empty patterns are skipped: the empty string is a substring of every path, so
/// honouring it would silently ignore every file.
fn should_ignore_file(path: &Path, ignore_patterns: &[String]) -> bool {
    let path_str = path.to_string_lossy();
    ignore_patterns
        .iter()
        .any(|pattern| !pattern.is_empty() && path_str.contains(pattern.as_str()))
}