use crate::types::{Tag, TagKind};
use anyhow::Result;
use once_cell::sync::Lazy;
use regex::Regex;
use std::path::Path;
use std::sync::Arc;
/// Stateless entry point for tag extraction.
///
/// The struct has no fields; `parse_file` picks a language-specific parser for a path.
pub struct Parser;
impl Parser {
pub fn new() -> Self {
Self
}
pub fn parse_file(&self, path: &Path, rel_fname: &str) -> Result<Vec<Tag>> {
let content = std::fs::read_to_string(path)?;
let lang = detect_language(path);
match lang {
Language::Python => parse_python(&content, path, rel_fname),
Language::Rust => parse_rust(&content, path, rel_fname),
Language::JavaScript => parse_javascript(&content, path, rel_fname),
Language::TypeScript => parse_typescript(&content, path, rel_fname),
Language::Unknown => Ok(vec![]),
}
}
}
impl Default for Parser {
fn default() -> Self {
Self::new()
}
}
/// Source languages this module knows how to extract tags from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Language {
    /// `.py` files.
    Python,
    /// `.rs` files.
    Rust,
    /// `.js`, `.jsx`, `.mjs` and `.cjs` files.
    JavaScript,
    /// `.ts`, `.tsx`, `.mts` and `.cts` files.
    TypeScript,
    /// Anything else, including paths without an extension.
    Unknown,
}

/// Maps a path to a [`Language`] using only its file extension.
///
/// The comparison ignores ASCII case, so `Main.RS` and `main.rs` are treated alike. Paths
/// with no extension, or with an extension that is not valid UTF-8, are `Language::Unknown`.
fn detect_language(path: &Path) -> Language {
    let ext = match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => ext.to_ascii_lowercase(),
        None => return Language::Unknown,
    };

    match ext.as_str() {
        "py" => Language::Python,
        "rs" => Language::Rust,
        // `.mjs` / `.cjs` are the explicit ES-module / CommonJS spellings of `.js`.
        "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
        // `.mts` / `.cts` are the TypeScript counterparts of `.mjs` / `.cjs`.
        "ts" | "tsx" | "mts" | "cts" => Language::TypeScript,
        _ => Language::Unknown,
    }
}
/// Returns the 1-based number of the line that contains `byte_offset` in `content`.
///
/// Newlines are counted on the raw bytes, so the offset does not have to fall on a UTF-8
/// character boundary. An offset past the end of `content` is clamped to its length instead
/// of panicking. The result saturates at `u32::MAX`.
fn line_number(content: &str, byte_offset: usize) -> u32 {
    let end = byte_offset.min(content.len());
    // The byte 0x0A never occurs inside a multi-byte UTF-8 sequence, so counting bytes gives
    // the same answer as counting '\n' characters.
    let newlines = content.as_bytes()[..end]
        .iter()
        .filter(|&&byte| byte == b'\n')
        .count();
    // Cap before narrowing so the cast and the `+ 1` can neither truncate nor overflow.
    newlines.min(u32::MAX as usize - 1) as u32 + 1
}
/// Lazily compiled regexes for Python definitions. Capture group 1 is always the name.
mod python_patterns {
    use super::*;

    /// `class Name` starting in column 0.
    pub static CLASS: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?m)^class\s+(\w+)").expect("Invalid Python class regex"));

    /// `def name(` or `async def name(` starting in column 0.
    pub static FUNCTION: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^(?:async\s+)?def\s+(\w+)\s*\(").expect("Invalid Python function regex")
    });

    /// `def name(` or `async def name(` that is indented by any mix of spaces and tabs.
    ///
    /// The indentation class is `[ \t]` rather than `\s` so the match can never begin on a
    /// preceding blank line. Any indented `def` matches, including functions nested inside
    /// other functions.
    pub static METHOD: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^[ \t]+(?:async\s+)?def\s+(\w+)\s*\(")
            .expect("Invalid Python method regex")
    });

    /// `UPPER_CASE =` assignments starting in column 0.
    pub static ASSIGNMENT: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^([A-Z_][A-Z0-9_]*)\s*=").expect("Invalid Python assignment regex")
    });
}
fn parse_python(content: &str, path: &Path, rel_fname: &str) -> Result<Vec<Tag>> {
let mut tags = Vec::new();
let fname: Arc<str> = Arc::from(path.to_string_lossy().into_owned());
let rel: Arc<str> = Arc::from(rel_fname);
for cap in python_patterns::CLASS.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("class"),
parent_name: None,
parent_line: None,
signature: None,
fields: None, metadata: None,
});
}
for cap in python_patterns::FUNCTION.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("function"),
parent_name: None,
parent_line: None,
signature: None, fields: None,
metadata: None,
});
}
for cap in python_patterns::METHOD.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("method"),
parent_name: None, parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in python_patterns::ASSIGNMENT.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("constant"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
Ok(tags)
}
/// Lazily compiled regexes for Rust items. Capture group 1 is always the item name.
///
/// Wherever a visibility is accepted, both plain `pub` and restricted forms such as
/// `pub(crate)`, `pub(super)` and `pub(in path)` match.
mod rust_patterns {
    use super::*;

    /// `fn name`, with optional visibility and any run of `const`, `async`, `unsafe` and
    /// `extern "ABI"` qualifiers in front of `fn`.
    pub static FUNCTION: Lazy<Regex> = Lazy::new(|| {
        Regex::new(
            r#"(?m)^\s*(?:pub(?:\([^)]*\))?\s+)?(?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*fn\s+(\w+)"#,
        )
        .expect("Invalid Rust fn regex")
    });

    /// `struct Name`, with optional visibility.
    pub static STRUCT: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:pub(?:\([^)]*\))?\s+)?struct\s+(\w+)")
            .expect("Invalid Rust struct regex")
    });

    /// `enum Name`, with optional visibility.
    pub static ENUM: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:pub(?:\([^)]*\))?\s+)?enum\s+(\w+)")
            .expect("Invalid Rust enum regex")
    });

    /// `trait Name`, with optional visibility and an optional `unsafe` qualifier.
    pub static TRAIT: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:pub(?:\([^)]*\))?\s+)?(?:unsafe\s+)?trait\s+(\w+)")
            .expect("Invalid Rust trait regex")
    });

    /// `impl Type` or `impl Trait for Type`; the captured name is the implementing type.
    /// Generic parameter lists (`impl<T> ...`) are not handled by this pattern.
    pub static IMPL: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*impl(?:\s+\w+\s+for)?\s+(\w+)").expect("Invalid Rust impl regex")
    });

    /// `const NAME:`, with optional visibility.
    ///
    /// The trailing `:` is required so that `const fn name()` is not reported as a constant
    /// called `fn`; constant items always carry a type annotation.
    pub static CONST: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:pub(?:\([^)]*\))?\s+)?const\s+(\w+)\s*:")
            .expect("Invalid Rust const regex")
    });

    /// `static NAME:`, with optional visibility.
    ///
    /// A `mut` (or `lazy_static!`-style `ref`) keyword before the name is skipped so the
    /// keyword itself is never captured as the name.
    pub static STATIC: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:pub(?:\([^)]*\))?\s+)?static\s+(?:(?:mut|ref)\s+)?(\w+)\s*:")
            .expect("Invalid Rust static regex")
    });
}
fn parse_rust(content: &str, path: &Path, rel_fname: &str) -> Result<Vec<Tag>> {
let mut tags = Vec::new();
let fname: Arc<str> = Arc::from(path.to_string_lossy().into_owned());
let rel: Arc<str> = Arc::from(rel_fname);
for cap in rust_patterns::FUNCTION.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("function"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in rust_patterns::STRUCT.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("struct"),
parent_name: None,
parent_line: None,
signature: None,
fields: None, metadata: None,
});
}
for cap in rust_patterns::ENUM.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("enum"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in rust_patterns::TRAIT.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("trait"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in rust_patterns::IMPL.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("impl"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in rust_patterns::CONST.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("const"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in rust_patterns::STATIC.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("static"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
Ok(tags)
}
/// Lazily compiled regexes for JavaScript definitions. Capture group 1 is always the name.
mod js_patterns {
    use super::*;

    /// `function name(`, optionally `async`, optionally prefixed by `export` or
    /// `export default`.
    pub static FUNCTION: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:export\s+(?:default\s+)?)?(?:async\s+)?function\s+(\w+)\s*\(")
            .expect("Invalid JS function regex")
    });

    /// `class Name`, optionally prefixed by `export` or `export default`.
    pub static CLASS: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:export\s+(?:default\s+)?)?class\s+(\w+)")
            .expect("Invalid JS class regex")
    });

    /// `const name = (` or `const name = async (`, i.e. a const bound to a parenthesised
    /// expression, which is how arrow functions usually start. Optionally exported.
    pub static CONST_ARROW: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:export\s+)?const\s+(\w+)\s*=\s*(?:async\s*)?\(")
            .expect("Invalid JS const arrow regex")
    });

    /// `const UPPER_CASE =`, optionally exported.
    pub static CONST_ASSIGN: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s*(?:export\s+)?const\s+([A-Z_][A-Z0-9_]*)\s*=")
            .expect("Invalid JS const regex")
    });

    /// Any indented `name(`, optionally preceded by `async`.
    ///
    /// NOTE(review): this also matches indented call expressions and keywords followed by
    /// `(`, and no use of it is visible in this file; confirm before relying on it.
    pub static METHOD: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?m)^\s+(?:async\s+)?(\w+)\s*\(").expect("Invalid JS method regex")
    });
}
fn parse_javascript(content: &str, path: &Path, rel_fname: &str) -> Result<Vec<Tag>> {
let mut tags = Vec::new();
let fname: Arc<str> = Arc::from(path.to_string_lossy().into_owned());
let rel: Arc<str> = Arc::from(rel_fname);
for cap in js_patterns::CLASS.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("class"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in js_patterns::FUNCTION.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("function"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in js_patterns::CONST_ARROW.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("function"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in js_patterns::CONST_ASSIGN.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("constant"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
Ok(tags)
}
/// Lazily compiled regexes for TypeScript-only declarations. Capture group 1 is the name.
mod ts_patterns {
    use super::*;

    /// Compiles `pattern`, panicking with `failure` as the message if it is invalid.
    fn build(pattern: &str, failure: &str) -> Regex {
        Regex::new(pattern).expect(failure)
    }

    /// `interface Name`, optionally exported.
    pub static INTERFACE: Lazy<Regex> = Lazy::new(|| {
        build(
            r"(?m)^\s*(?:export\s+)?interface\s+(\w+)",
            "Invalid TS interface regex",
        )
    });

    /// `type Name =`, optionally exported.
    pub static TYPE_ALIAS: Lazy<Regex> = Lazy::new(|| {
        build(
            r"(?m)^\s*(?:export\s+)?type\s+(\w+)\s*=",
            "Invalid TS type regex",
        )
    });

    /// `enum Name`, optionally exported.
    pub static ENUM: Lazy<Regex> = Lazy::new(|| {
        build(
            r"(?m)^\s*(?:export\s+)?enum\s+(\w+)",
            "Invalid TS enum regex",
        )
    });
}
fn parse_typescript(content: &str, path: &Path, rel_fname: &str) -> Result<Vec<Tag>> {
let mut tags = parse_javascript(content, path, rel_fname)?;
let fname: Arc<str> = Arc::from(path.to_string_lossy().into_owned());
let rel: Arc<str> = Arc::from(rel_fname);
for cap in ts_patterns::INTERFACE.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("interface"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in ts_patterns::TYPE_ALIAS.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("type"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
for cap in ts_patterns::ENUM.captures_iter(content) {
let line = line_number(content, cap.get(0).unwrap().start());
tags.push(Tag {
rel_fname: rel.clone(),
fname: fname.clone(),
line,
name: Arc::from(&cap[1]),
kind: TagKind::Def,
node_type: Arc::from("enum"),
parent_name: None,
parent_line: None,
signature: None,
fields: None,
metadata: None,
});
}
Ok(tags)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Offsets at the start of each line map to 1-based line numbers.
    #[test]
    fn test_line_number() {
        let content = "line 1\nline 2\nline 3\n";
        let cases: [(usize, u32); 3] = [(0, 1), (7, 2), (14, 3)];
        for &(offset, expected) in cases.iter() {
            assert_eq!(line_number(content, offset), expected);
        }
    }

    /// One representative extension per language, plus an unsupported one.
    #[test]
    fn test_detect_language() {
        let cases = [
            ("foo.py", Language::Python),
            ("bar.rs", Language::Rust),
            ("baz.js", Language::JavaScript),
            ("qux.ts", Language::TypeScript),
            ("unknown.txt", Language::Unknown),
        ];
        for &(file, expected) in cases.iter() {
            assert_eq!(detect_language(Path::new(file)), expected);
        }
    }

    /// A lone top-level class yields exactly one `class` tag on line 1.
    #[test]
    fn test_parse_python_class() {
        let source = "class Foo:\n pass\n";
        let found = parse_python(source, Path::new("test.py"), "test.py").unwrap();
        assert_eq!(found.len(), 1);
        let tag = &found[0];
        assert_eq!(tag.name.as_ref(), "Foo");
        assert_eq!(tag.node_type.as_ref(), "class");
        assert_eq!(tag.line, 1);
    }

    /// A lone top-level `def` yields exactly one `function` tag.
    #[test]
    fn test_parse_python_function() {
        let source = "def bar():\n pass\n";
        let found = parse_python(source, Path::new("test.py"), "test.py").unwrap();
        assert_eq!(found.len(), 1);
        let tag = &found[0];
        assert_eq!(tag.name.as_ref(), "bar");
        assert_eq!(tag.node_type.as_ref(), "function");
    }

    /// A lone `pub fn` yields exactly one `function` tag.
    #[test]
    fn test_parse_rust_function() {
        let source = "pub fn foo() {}\n";
        let found = parse_rust(source, Path::new("test.rs"), "test.rs").unwrap();
        assert_eq!(found.len(), 1);
        let tag = &found[0];
        assert_eq!(tag.name.as_ref(), "foo");
        assert_eq!(tag.node_type.as_ref(), "function");
    }

    /// A lone `pub struct` yields exactly one `struct` tag.
    #[test]
    fn test_parse_rust_struct() {
        let source = "pub struct Bar {\n field: i32\n}\n";
        let found = parse_rust(source, Path::new("test.rs"), "test.rs").unwrap();
        assert_eq!(found.len(), 1);
        let tag = &found[0];
        assert_eq!(tag.name.as_ref(), "Bar");
        assert_eq!(tag.node_type.as_ref(), "struct");
    }

    /// A class with only a constructor yields exactly one `class` tag.
    #[test]
    fn test_parse_javascript_class() {
        let source = "class MyClass {\n constructor() {}\n}\n";
        let found = parse_javascript(source, Path::new("test.js"), "test.js").unwrap();
        assert_eq!(found.len(), 1);
        let tag = &found[0];
        assert_eq!(tag.name.as_ref(), "MyClass");
        assert_eq!(tag.node_type.as_ref(), "class");
    }

    /// An exported interface shows up among the TypeScript tags.
    #[test]
    fn test_parse_typescript_interface() {
        let source = "export interface IFoo {\n bar: string;\n}\n";
        let found = parse_typescript(source, Path::new("test.ts"), "test.ts").unwrap();
        let has_interface = found
            .iter()
            .any(|tag| tag.name.as_ref() == "IFoo" && tag.node_type.as_ref() == "interface");
        assert!(has_interface);
    }
}