use super::Symbol;
use crate::parser::read_file_content;
use regex::Regex;
use std::collections::HashMap;
use std::path::Path;
/// Regex-driven extractor that turns source text into [`Symbol`] records.
///
/// The extractor owns one set of compiled patterns per supported language
/// and selects the set to use from a file extension.
pub struct SymbolExtractor {
    /// Compiled pattern sets, keyed by file extension without the leading
    /// dot (for example `"rs"`, `"py"` or `"ts"`).
    language_extractors: HashMap<String, LanguageExtractor>,
}
/// The compiled regular expressions describing one language's syntax.
///
/// The type is `Clone` so that a single set of patterns can be registered
/// under several file extensions.
#[derive(Clone)]
struct LanguageExtractor {
    /// Patterns matching function definitions.
    function_patterns: Vec<Regex>,
    /// Patterns matching type-like definitions (classes, structs, enums,
    /// traits, interfaces, type aliases).
    class_patterns: Vec<Regex>,
    /// Patterns matching function definitions that take a receiver or sit
    /// inside a type body.
    method_patterns: Vec<Regex>,
    /// Patterns matching constant or variable declarations.
    variable_patterns: Vec<Regex>,
    /// Patterns matching import-like statements.
    import_patterns: Vec<Regex>,
    /// Maps a visibility keyword as written in source (for example `pub`
    /// or `private`) to the corresponding `SymbolVisibility`.
    visibility_keywords: HashMap<String, super::SymbolVisibility>,
}
impl SymbolExtractor {
pub fn new() -> Self {
let mut language_extractors = HashMap::new();
let rust_extractor = LanguageExtractor {
function_patterns: vec![
Regex::new(r"(?m)^(pub\s+)?(async\s+)?(unsafe\s+)?fn\s+(\w+)\s*(<[^>]*>)?\s*\(([^)]*)\)").unwrap(),
Regex::new(r"(?m)^(pub\s+)?(async\s+)?(unsafe\s+)?fn\s+(\w+)").unwrap(),
],
class_patterns: vec![
Regex::new(r"(?m)^(pub\s+)?struct\s+(\w+)\s*(<[^>]*>)?").unwrap(),
Regex::new(r"(?m)^(pub\s+)?enum\s+(\w+)\s*(<[^>]*>)?").unwrap(),
Regex::new(r"(?m)^(pub\s+)?trait\s+(\w+)\s*(<[^>]*>)?").unwrap(),
Regex::new(r"(?m)^type\s+(\w+)\s*=").unwrap(),
],
method_patterns: vec![
Regex::new(r"(?m)^\s*(pub\s+)?(async\s+)?(unsafe\s+)?fn\s+(\w+)\s*(<[^>]*>)?\s*\(&?self").unwrap(),
],
variable_patterns: vec![
Regex::new(r"(?m)^(pub\s+)?(const|static)\s+(\w+)\s*:\s*").unwrap(),
],
import_patterns: vec![
Regex::new(r"use\s+([^;]+);").unwrap(),
Regex::new(r"extern\s+crate\s+(\w+);").unwrap(),
Regex::new(r"mod\s+(\w+);").unwrap(),
],
visibility_keywords: {
let mut map = HashMap::new();
map.insert("pub".to_string(), super::SymbolVisibility::Public);
map.insert("pub(crate)".to_string(), super::SymbolVisibility::Internal);
map.insert("pub(super)".to_string(), super::SymbolVisibility::Private);
map
},
};
language_extractors.insert("rs".to_string(), rust_extractor);
let python_extractor = LanguageExtractor {
function_patterns: vec![
Regex::new(r"(?m)^(def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*([^:]+))?:)").unwrap(),
Regex::new(r"(?m)^(async\s+def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*([^:]+))?:)").unwrap(),
Regex::new(r"(?m)^(class\s+(\w+)(?:\s*\([^)]*\))?:)").unwrap(),
],
class_patterns: vec![
Regex::new(r"(?m)^(class\s+(\w+)(?:\s*\([^)]*\))?:)").unwrap(),
],
method_patterns: vec![
Regex::new(r"(?m)^\s+(def\s+(\w+)\s*\(&?self)").unwrap(),
],
variable_patterns: vec![
Regex::new(r"(?m)^[A-Z_]+\s*=").unwrap(),
],
import_patterns: vec![
Regex::new(r"import\s+([^#\n]+)").unwrap(),
Regex::new(r"from\s+([\w.]+)\s+import").unwrap(),
],
visibility_keywords: HashMap::new(),
};
language_extractors.insert("py".to_string(), python_extractor.clone());
language_extractors.insert("pyw".to_string(), python_extractor.clone());
language_extractors.insert("pyi".to_string(), python_extractor);
let js_extractor = LanguageExtractor {
function_patterns: vec![
Regex::new(r"(?m)^(?:export\s+(?:default\s+)?)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)(?:\s*:\s*([^={]+))?").unwrap(),
Regex::new(r"(?m)^(?:export\s+(?:default\s+)?)?(?:const|let|var)\s+(\w+)\s*(?:<[^>]*>)?\s*=\s*(?:async\s+)?\(([^)]*)\)(?:\s*:\s*([^=]+))?\s*=>").unwrap(),
Regex::new(r"(?m)^(?:export\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([^{]+))?").unwrap(),
Regex::new(r"(?m)^(?:export\s+)?interface\s+(\w+)(?:\s+extends\s+([^{]+))?").unwrap(),
Regex::new(r"(?m)^type\s+(\w+)\s*=").unwrap(),
],
class_patterns: vec![
Regex::new(r"(?m)^(?:export\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([^{]+))?").unwrap(),
Regex::new(r"(?m)^(?:export\s+)?interface\s+(\w+)(?:\s+extends\s+([^{]+))?").unwrap(),
Regex::new(r"(?m)^type\s+(\w+)\s*=").unwrap(),
],
method_patterns: vec![
Regex::new(r"(?m)^\s+(?:public|private|protected)?\s*(?:async\s+)?(\w+)\s*(?:<[^>]*>)?\s*\(([^)]*)\)(?:\s*:\s*[^{]+)?\s*\{").unwrap(),
],
variable_patterns: vec![
Regex::new(r"(?m)^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*:\s*([^=]+)").unwrap(),
],
import_patterns: vec![
Regex::new(r#"import\s+(?:\{([^}]+)\}|\*\s+as\s+(\w+)|(\w+))\s+from\s+['"]([^'"]+)['"]"#).unwrap(),
Regex::new(r#"import\s+['"]([^'"]+)['"]"#).unwrap(),
Regex::new(r#"require\s*\(\s*['"]([^'"]+)['"]\s*\)"#).unwrap(),
],
visibility_keywords: {
let mut map = HashMap::new();
map.insert("public".to_string(), super::SymbolVisibility::Public);
map.insert("private".to_string(), super::SymbolVisibility::Private);
map.insert("protected".to_string(), super::SymbolVisibility::Protected);
map
},
};
language_extractors.insert("js".to_string(), js_extractor.clone());
language_extractors.insert("jsx".to_string(), js_extractor.clone());
language_extractors.insert("ts".to_string(), js_extractor.clone());
language_extractors.insert("tsx".to_string(), js_extractor.clone());
language_extractors.insert("mjs".to_string(), js_extractor.clone());
language_extractors.insert("cjs".to_string(), js_extractor);
Self {
language_extractors,
}
}
pub fn extract_from_file(&self, file_path: &Path) -> Result<Vec<Symbol>, Box<dyn std::error::Error>> {
let content = read_file_content(file_path.to_string_lossy().as_ref());
let ext = file_path.extension()
.and_then(|e| e.to_str())
.unwrap_or("");
self.extract_from_content(&content, file_path, ext)
}
pub fn extract_from_content(
&self,
content: &str,
file_path: &Path,
extension: &str,
) -> Result<Vec<Symbol>, Box<dyn std::error::Error>> {
let mut symbols = Vec::new();
let extractor = self.language_extractors.get(extension);
if let Some(extractor) = extractor {
for (idx, pattern) in extractor.function_patterns.iter().enumerate() {
for cap in pattern.captures_iter(content) {
if let Some(symbol) = self.extract_function(&cap, file_path, content, extension) {
symbols.push(symbol);
}
}
}
for pattern in &extractor.class_patterns {
for cap in pattern.captures_iter(content) {
if let Some(symbol) = self.extract_class(&cap, file_path, content, extension) {
symbols.push(symbol);
}
}
}
for pattern in &extractor.variable_patterns {
for cap in pattern.captures_iter(content) {
if let Some(symbol) = self.extract_variable(&cap, file_path, content, extension) {
symbols.push(symbol);
}
}
}
}
let mut seen = std::collections::HashSet::new();
symbols.retain(|s| seen.insert(s.id.clone()));
Ok(symbols)
}
fn extract_function(
&self,
cap: ®ex::Captures,
file_path: &Path,
content: &str,
extension: &str,
) -> Option<Symbol> {
let name = cap.get(4)?.as_str().to_string();
let full_match = cap.get(0)?;
let line_num = content[..full_match.start()].lines().count() + 1;
let signature = cap.get(0)?.as_str().to_string();
Some(Symbol {
id: super::Symbol::create_id(
&file_path.to_string_lossy(),
&name,
&super::SymbolKind::Function,
line_num,
),
name,
kind: super::SymbolKind::Function,
file_path: file_path.to_string_lossy().to_string(),
line: line_num,
column: 0,
end_line: line_num,
signature,
documentation: None,
visibility: super::SymbolVisibility::Public,
parent: None,
type_info: None,
generics: vec![],
annotations: vec![],
attributes: vec![],
metadata: HashMap::new(),
})
}
fn extract_class(
&self,
cap: ®ex::Captures,
file_path: &Path,
content: &str,
extension: &str,
) -> Option<Symbol> {
let name = cap
.get(2)
.or_else(|| cap.get(1))
?.as_str()
.to_string();
let full_match = cap.get(0)?;
let line_num = content[..full_match.start()].lines().count() + 1;
let signature = cap.get(0)?.as_str().to_string();
Some(Symbol {
id: super::Symbol::create_id(
&file_path.to_string_lossy(),
&name,
&super::SymbolKind::Class,
line_num,
),
name,
kind: super::SymbolKind::Class,
file_path: file_path.to_string_lossy().to_string(),
line: line_num,
column: 0,
end_line: line_num,
signature,
documentation: None,
visibility: super::SymbolVisibility::Public,
parent: None,
type_info: None,
generics: vec![],
annotations: vec![],
attributes: vec![],
metadata: HashMap::new(),
})
}
fn extract_variable(
&self,
cap: ®ex::Captures,
file_path: &Path,
content: &str,
extension: &str,
) -> Option<Symbol> {
let name = cap.get(3)?.as_str().to_string();
let full_match = cap.get(0)?;
let line_num = content[..full_match.start()].lines().count() + 1;
let signature = cap.get(0)?.as_str().to_string();
Some(Symbol {
id: super::Symbol::create_id(
&file_path.to_string_lossy(),
&name,
&super::SymbolKind::Variable,
line_num,
),
name,
kind: super::SymbolKind::Variable,
file_path: file_path.to_string_lossy().to_string(),
line: line_num,
column: 0,
end_line: line_num,
signature,
documentation: None,
visibility: super::SymbolVisibility::Public,
parent: None,
type_info: None,
generics: vec![],
annotations: vec![],
attributes: vec![],
metadata: HashMap::new(),
})
}
}
impl Default for SymbolExtractor {
fn default() -> Self {
Self::new()
}
}
/// Convenience wrapper: extracts the symbols of a single file.
///
/// A fresh [`SymbolExtractor`] is constructed on every call and then asked
/// to process `file_path`; callers handling many files may prefer to build
/// one extractor themselves and reuse it.
pub fn extract_symbols_from_file(file_path: &Path) -> Result<Vec<Symbol>, Box<dyn std::error::Error>> {
    SymbolExtractor::new().extract_from_file(file_path)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::symbols::SymbolKind;
    use std::fs;
    use tempfile::tempdir;

    /// Writes `source` to a file called `file_name` inside a fresh temporary
    /// directory and returns the symbols extracted from that file.
    fn symbols_for(file_name: &str, source: &str) -> Vec<Symbol> {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join(file_name);
        fs::write(&file_path, source).unwrap();
        SymbolExtractor::new().extract_from_file(&file_path).unwrap()
    }

    /// A top-level Rust `fn` is reported as a function with its header as
    /// the signature.
    #[test]
    fn test_extract_rust_function() {
        let symbols = symbols_for(
            "test.rs",
            r#"
pub fn my_function(param1: i32, param2: String) -> Result<i32> {
Ok(42)
}
"#,
        );
        assert!(!symbols.is_empty());
        let func = symbols
            .iter()
            .find(|s| s.name == "my_function")
            .expect("my_function should be extracted");
        assert_eq!(func.kind, SymbolKind::Function);
        assert!(func.signature.contains("my_function"));
    }

    /// A Python class header is reported under the class name.
    #[test]
    fn test_extract_python_class() {
        let symbols = symbols_for(
            "test.py",
            r#"
class MyClass:
def __init__(self, value):
self.value = value
"#,
        );
        assert!(!symbols.is_empty());
        assert!(symbols.iter().any(|s| s.name == "MyClass"));
    }

    /// An exported TypeScript interface is reported under the interface name.
    #[test]
    fn test_extract_typescript_interface() {
        let symbols = symbols_for(
            "test.ts",
            r#"
export interface User {
id: number;
name: string;
}
"#,
        );
        assert!(!symbols.is_empty());
        assert!(symbols.iter().any(|s| s.name == "User"));
    }
}