mod csharp;
mod go;
mod java;
pub mod python;
mod rust;
mod typescript;
mod c;
mod cpp;
pub mod streaming;
pub mod lightweight;
pub mod lightweight_parser;
pub use lightweight_parser::parse_file_lightweight;
use anyhow::Result;
use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::sync::LazyLock;
use tree_sitter::Node;
/// Largest file size, in bytes, that `parse_file_inner` will parse (2 MiB).
/// Files whose metadata reports a larger size are skipped with a warning.
const MAX_PARSE_FILE_BYTES: u64 = 2 * 1024 * 1024;
/// Structural fingerprint data for one function, cached per file in `FP_CACHE`.
///
/// Instances are built by `collect_full_fps_recursive` while walking a file's syntax tree.
#[derive(Debug, Clone)]
pub struct CachedFunctionFP {
/// Function name, read from the syntax node's `name` field (empty if the node has none).
pub name: String,
/// Line on which the function node starts (syntax-tree row + 1).
pub line_start: u32,
/// End line copied from the parsed `Function` with the same name and start line.
pub line_end: u32,
/// Structural kinds gathered from the function body by `ast_fingerprint::collect_all_features`.
pub structural_kinds: HashSet<String>,
/// Adjacent pairs of normalized body tokens, each formatted as `first:second`.
pub normalized_bigrams: HashSet<String>,
/// Identifiers gathered from the function body by `ast_fingerprint::collect_all_features`.
pub identifiers: Vec<String>,
/// Boilerplate patterns reported by `ast_fingerprint::detect_patterns_from_data` for the body.
pub patterns: Vec<crate::detectors::BoilerplatePattern>,
/// MinHash signature of `normalized_bigrams`; `None` when there are no bigrams.
pub minhash_sig: Option<[u64; 100]>,
}
/// Process-wide, lazily created cache of per-function fingerprints.
/// Keys are file paths rendered with `Path::to_string_lossy`; entries are written by
/// `extract_full_fps` and read by `get_cached_fps`.
static FP_CACHE: LazyLock<dashmap::DashMap<String, Vec<CachedFunctionFP>>> =
LazyLock::new(dashmap::DashMap::new);
/// Returns a copy of the fingerprints cached for `file_path`.
///
/// Yields `None` when the cache holds no entry under that exact key. The returned vector
/// is a clone, so callers never hold a reference into the cache.
pub fn get_cached_fps(file_path: &str) -> Option<Vec<CachedFunctionFP>> {
    let cached = FP_CACHE.get(file_path)?;
    Some(cached.value().clone())
}
/// Removes every entry from the process-wide fingerprint cache (`FP_CACHE`).
pub fn clear_structural_fingerprint_cache() {
FP_CACHE.clear();
}
/// Returns the name of the narrowest scope whose inclusive line range contains `line`.
///
/// `scope_map` maps `(start_line, end_line)` ranges to scope names. Among all ranges that
/// contain `line`, the one with the smallest span (`end - start`) wins. When several
/// candidate ranges have the same span, the one with the lowest start line is chosen, so
/// the result does not depend on `HashMap` iteration order.
///
/// Returns `None` when no range contains `line`.
pub(crate) fn find_containing_scope(
    line: u32,
    scope_map: &HashMap<(u32, u32), String>,
) -> Option<String> {
    scope_map
        .iter()
        .filter(|((start, end), _)| (*start..=*end).contains(&line))
        // `end >= start` is guaranteed for every surviving range, so the subtraction
        // cannot underflow. Keys are unique, so `(span, start)` is a total tie-break.
        .min_by_key(|((start, end), _)| (end - start, *start))
        .map(|(_, name)| name.clone())
}
pub(crate) fn is_inside_ancestor(node: &Node, ancestor_kind: &str) -> bool {
let mut current = node.parent();
while let Some(parent) = current {
if parent.kind() == ancestor_kind {
return true;
}
current = parent.parent();
}
false
}
/// Heuristically decides whether header text looks like C++ rather than C.
///
/// Only the first 16 KiB of `source` are inspected. The function returns `true` when that
/// prefix contains any of a fixed list of C++-only markers (keywords such as `class ` or
/// `namespace `, `std::`, or includes of common C++ standard headers).
///
/// The prefix is shortened to the nearest preceding UTF-8 character boundary, so a
/// multi-byte character straddling the 16 KiB mark cannot cause a slicing panic.
fn is_probably_cpp_header(source: &str) -> bool {
    const SNIFF_LIMIT_BYTES: usize = 16 * 1024;
    const CPP_MARKERS: [&str; 11] = [
        "class ",
        "namespace ",
        "template<",
        "template <",
        "typename ",
        "constexpr",
        "std::",
        "using namespace",
        "#include <iostream>",
        "#include <vector>",
        "#include <string>",
    ];

    // Index 0 is always a char boundary, so this loop terminates.
    let mut end = source.len().min(SNIFF_LIMIT_BYTES);
    while !source.is_char_boundary(end) {
        end -= 1;
    }
    let head = &source[..end];

    CPP_MARKERS.iter().any(|marker| head.contains(*marker))
}
/// Parses the file at `path` without value extraction.
///
/// Delegates to `parse_file_inner` with `extract_values = false`, so the returned
/// `ParseResult::raw_values` is left unset by this entry point.
pub fn parse_file(path: &Path) -> Result<ParseResult> {
parse_file_inner(path, false)
}
/// Parses the file at `path` and additionally runs value extraction.
///
/// Delegates to `parse_file_inner` with `extract_values = true`; `raw_values` is populated
/// when a syntax tree was produced and a value-extraction config exists for the extension.
pub fn parse_file_with_values(path: &Path) -> Result<ParseResult> {
parse_file_inner(path, true)
}
/// Shared implementation behind `parse_file` and `parse_file_with_values`.
///
/// Steps, in order:
/// 1. Skip files larger than `MAX_PARSE_FILE_BYTES`.
/// 2. Fetch the file content from the global cache.
/// 3. Dispatch on the file extension to the matching language parser.
/// 4. Fill in missing nesting depths and cache structural fingerprints.
/// 5. Optionally extract raw values (`extract_values`).
/// 6. Store masked content for the file in the global cache.
///
/// Returns an empty `ParseResult` (not an error) when the file is too large, when the
/// global cache yields no content for `path`, or when the extension has no parser wired
/// up here (this includes Kotlin `kt`/`kts`). Errors returned by a language parser are
/// propagated to the caller.
fn parse_file_inner(path: &Path, extract_values: bool) -> Result<ParseResult> {
// Size guardrail. If the metadata cannot be read the check is simply skipped.
if let Ok(meta) = std::fs::metadata(path) {
if meta.len() > MAX_PARSE_FILE_BYTES {
tracing::warn!(
"Skipping {} ({:.1}MB exceeds {}MB guardrail)",
path.display(),
meta.len() as f64 / (1024.0 * 1024.0),
MAX_PARSE_FILE_BYTES / (1024 * 1024),
);
return Ok(ParseResult::default());
}
}
// A missing or non-UTF-8 extension becomes "", which falls through to the `_` arm below.
let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
let source = match crate::cache::global_cache().content(path) {
Some(s) => s,
None => return Ok(ParseResult::default()),
};
// Each arm yields `(Ok(result), Some(tree))`; the trailing `?` propagates parser errors,
// so `parsed` is always `Ok` past this point. `tree` is `None` when nothing was parsed.
let (mut parsed, tree) = match ext {
"py" | "pyi" => python::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?,
"ts" | "tsx" | "js" | "jsx" | "mjs" | "cjs" => {
typescript::parse_source_with_tree(&source, path, ext).map(|(r, t)| (Ok(r), Some(t)))?
}
"rs" => rust::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?,
"go" => go::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?,
"java" => java::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?,
"cs" => csharp::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?,
// Kotlin is recognised by extension but no parser is invoked for it here.
"kt" | "kts" => (Ok(ParseResult::default()), None),
"c" => c::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?,
// `.h` is ambiguous: sniff the content to choose between the C++ and C parsers.
"h" => {
if is_probably_cpp_header(&source) {
cpp::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?
} else {
c::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?
}
}
"cpp" | "cc" | "cxx" | "c++" | "hpp" | "hh" | "hxx" | "h++" => {
cpp::parse_source_with_tree(&source, path).map(|(r, t)| (Ok(r), Some(t)))?
}
_ => (Ok(ParseResult::default()), None),
};
// Fill in `max_nesting` for functions whose parser did not set it.
if let Ok(ref mut result) = parsed {
enrich_nesting_depths(result, &source, path);
}
// Reuse the syntax tree to populate the fingerprint cache for this file.
if let (Ok(ref result), Some(ref tree)) = (&parsed, &tree) {
extract_full_fps(tree, &source, ext, path, &result.functions);
}
if extract_values {
if let (Ok(ref mut result), Some(ref tree)) = (&mut parsed, &tree) {
if let Some(config) = crate::values::configs::config_for_extension(ext) {
// The file stem is passed to the extractor as the qualified-name prefix.
let file_qualified_prefix = path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("unknown");
let raw = crate::values::extraction::extract_file_values(
tree,
&source,
&config,
&result.functions,
file_qualified_prefix,
);
result.raw_values = Some(raw);
}
}
}
// With a tree available, masked content is computed from it and stored explicitly.
if let Some(ref tree) = tree {
let masked = crate::cache::masking::mask_non_code_with_tree(&source, ext, tree);
crate::cache::global_cache().store_masked(path, masked);
} else {
// NOTE(review): the result is discarded; presumably this call makes the cache compute
// and retain masked content without a tree — confirm against `masked_content`.
let _ = crate::cache::global_cache().masked_content(path);
}
parsed
}
/// Computes structural fingerprints for the parsed functions of one file and records
/// them in `FP_CACHE`.
///
/// * `tree` / `source` – syntax tree and text of the file.
/// * `ext` – file extension, used to look up which node kinds denote functions.
/// * `path` – file path; its lossy string form is the cache key.
/// * `functions` – functions already extracted by the language parser. Only syntax nodes
///   whose name and start line match one of these are fingerprinted.
///
/// Does nothing when the language has no known function node kinds. When the walk yields
/// no fingerprints, any entry previously cached for `path` is removed, so a re-parse of a
/// file that no longer contains functions does not leave stale fingerprints behind.
fn extract_full_fps(
    tree: &tree_sitter::Tree,
    source: &str,
    ext: &str,
    path: &Path,
    functions: &[crate::models::Function],
) {
    use crate::detectors::ast_fingerprint;

    let lang = crate::parsers::lightweight::Language::from_extension(ext);
    let func_kinds = ast_fingerprint::function_node_kinds(lang);
    if func_kinds.is_empty() {
        return;
    }
    let func_kind_set: HashSet<&str> = func_kinds.iter().copied().collect();

    // (name, start line) -> end line, used to pair syntax nodes with parsed functions.
    let func_set: HashMap<(String, u32), u32> = functions
        .iter()
        .map(|f| ((f.name.clone(), f.line_start), f.line_end))
        .collect();

    let mut fps = Vec::new();
    collect_full_fps_recursive(
        tree.root_node(),
        source,
        &func_kind_set,
        &func_set,
        &mut fps,
    );

    // Signatures are only meaningful when there is at least one bigram to hash.
    for fp in &mut fps {
        if !fp.normalized_bigrams.is_empty() {
            fp.minhash_sig = Some(ast_fingerprint::compute_minhash_signature(
                &fp.normalized_bigrams,
            ));
        }
    }

    let path_str = path.to_string_lossy().to_string();
    if fps.is_empty() {
        // Drop whatever an earlier parse of this path may have cached.
        FP_CACHE.remove(&path_str);
    } else {
        FP_CACHE.insert(path_str, fps);
    }
}
/// Walks the syntax tree depth-first (parent before children) and appends a
/// `CachedFunctionFP` to `out` for every function node that matches a parsed function.
///
/// * `func_kinds` – node kinds that denote a function in the current language.
/// * `func_set` – `(name, start line) -> end line` for the functions the parser reported.
///
/// Children of a function node are still visited, so nested functions are picked up too.
fn collect_full_fps_recursive(
    node: tree_sitter::Node,
    source: &str,
    func_kinds: &HashSet<&str>,
    func_set: &HashMap<(String, u32), u32>,
    out: &mut Vec<CachedFunctionFP>,
) {
    if func_kinds.contains(node.kind()) {
        if let Some(fingerprint) = build_function_fp(node, source, func_set) {
            out.push(fingerprint);
        }
    }

    for child in (0..node.child_count()).filter_map(|index| node.child(index)) {
        collect_full_fps_recursive(child, source, func_kinds, func_set, out);
    }
}

/// Builds the fingerprint for a single function node, or returns `None` when the node
/// does not correspond to a parsed function or yields no structural data at all.
fn build_function_fp(
    node: tree_sitter::Node,
    source: &str,
    func_set: &HashMap<(String, u32), u32>,
) -> Option<CachedFunctionFP> {
    use crate::detectors::ast_fingerprint;

    // Syntax-tree rows are shifted by one to line up with the parsed functions' start lines.
    let line_start = node.start_position().row as u32 + 1;
    let name = match node.child_by_field_name("name") {
        Some(name_node) => name_node
            .utf8_text(source.as_bytes())
            .unwrap_or_default()
            .to_string(),
        None => String::new(),
    };

    // Only nodes the language parser also reported (same name and start line) qualify.
    let line_end = *func_set.get(&(name.clone(), line_start))?;

    // Fall back to the whole node when the grammar exposes no `body` field.
    let body_node = node.child_by_field_name("body").unwrap_or(node);

    let mut normalized_tokens = Vec::new();
    let mut structural_kinds = HashSet::new();
    let mut identifiers = Vec::new();
    let mut all_kinds = HashSet::new();
    ast_fingerprint::collect_all_features(
        body_node,
        source,
        &mut normalized_tokens,
        &mut structural_kinds,
        &mut identifiers,
        &mut all_kinds,
    );

    let normalized_bigrams: HashSet<String> = normalized_tokens
        .windows(2)
        .map(|pair| format!("{}:{}", pair[0], pair[1]))
        .collect();

    let body_text = &source[body_node.start_byte()..body_node.end_byte()];
    let patterns = ast_fingerprint::detect_patterns_from_data(&all_kinds, body_text);

    if structural_kinds.is_empty() && normalized_bigrams.is_empty() {
        return None;
    }

    Some(CachedFunctionFP {
        name,
        line_start,
        line_end,
        structural_kinds,
        normalized_bigrams,
        identifiers,
        patterns,
        // Filled in later by the caller once all fingerprints are collected.
        minhash_sig: None,
    })
}
fn enrich_nesting_depths(result: &mut ParseResult, source: &str, path: &Path) {
let lines: Vec<&str> = source.lines().collect();
let is_python = path.extension().is_some_and(|e| e == "py" || e == "pyi");
for func in &mut result.functions {
if func.max_nesting.is_some() {
continue;
}
let start = func.line_start.saturating_sub(1) as usize;
let end = (func.line_end as usize).min(lines.len());
if start >= end {
continue;
}
let max_depth = if is_python {
compute_nesting_indent(&lines[start..end])
} else {
compute_nesting_braces(&lines[start..end])
};
func.max_nesting = Some(max_depth);
}
}
/// Estimates the maximum block nesting depth of a brace-delimited function.
///
/// The first `{` encountered is taken to open the function body and does not count.
/// Every later `{` increases the depth and every later `}` decreases it; the highest
/// depth reached is returned. Lines that (after trimming) start with `//`, `*` or `/*`
/// are skipped as comments. Braces inside string literals or trailing comments are not
/// recognised and are counted like any other brace.
fn compute_nesting_braces(lines: &[&str]) -> u32 {
    let mut body_opened = false;
    let mut current: i32 = 0;
    let mut deepest: i32 = 0;

    let code_chars = lines
        .iter()
        .map(|raw| raw.trim())
        .filter(|text| {
            !(text.starts_with("//") || text.starts_with('*') || text.starts_with("/*"))
        })
        .flat_map(|text| text.chars());

    for symbol in code_chars {
        if symbol == '{' {
            if !body_opened {
                // Function body opener: establishes depth 0.
                body_opened = true;
                continue;
            }
            current += 1;
            deepest = deepest.max(current);
        } else if symbol == '}' && body_opened {
            current -= 1;
        }
    }

    deepest.max(0) as u32
}
/// Estimates the maximum nesting depth of an indentation-delimited (Python) function.
///
/// The first line is taken as the function header and fixes the base indentation. For
/// every later line that is neither blank nor a `#` comment, the extra indentation
/// relative to the header is divided by four columns; the largest quotient is returned.
/// A function whose body sits one level below the header therefore scores 1.
///
/// A leading tab counts as one full indent level (four columns), so tab-indented code is
/// measured the same way as four-space-indented code instead of always scoring 0. Every
/// other leading whitespace character counts by its UTF-8 byte length, as before.
///
/// Returns 0 for an empty slice.
fn compute_nesting_indent(lines: &[&str]) -> u32 {
    const INDENT_WIDTH: usize = 4;

    /// Width of the leading whitespace of `line`, with tabs expanded to one indent level.
    fn indent_columns(line: &str) -> usize {
        line.chars()
            .take_while(|c| c.is_whitespace())
            .map(|c| if c == '\t' { INDENT_WIDTH } else { c.len_utf8() })
            .sum()
    }

    let Some((header, body)) = lines.split_first() else {
        return 0;
    };
    let base_indent = indent_columns(header);

    body.iter()
        .filter(|line| {
            let trimmed = line.trim();
            !trimmed.is_empty() && !trimmed.starts_with('#')
        })
        // Lines at or left of the header contribute depth 0.
        .map(|line| indent_columns(line).saturating_sub(base_indent) / INDENT_WIDTH)
        .max()
        .unwrap_or(0) as u32
}
/// Maps a file extension (without the leading dot) to a display name for its language.
///
/// Matching is case-sensitive. `.h` is reported as `"C"`. Returns `None` for extensions
/// that are not in the table.
pub fn language_for_extension(ext: &str) -> Option<&'static str> {
    const LANGUAGES: &[(&str, &[&str])] = &[
        ("Python", &["py", "pyi"]),
        ("TypeScript", &["ts", "tsx"]),
        ("JavaScript", &["js", "jsx", "mjs", "cjs"]),
        ("Rust", &["rs"]),
        ("Go", &["go"]),
        ("Java", &["java"]),
        ("C#", &["cs"]),
        ("Kotlin", &["kt", "kts"]),
        ("C", &["c", "h"]),
        ("C++", &["cpp", "cc", "cxx", "c++", "hpp", "hh", "hxx", "h++"]),
    ];

    LANGUAGES
        .iter()
        .find(|(_, extensions)| extensions.iter().any(|known| *known == ext))
        .map(|(language, _)| *language)
}
/// Returns the file extensions (without the leading dot) known to this module.
///
/// The list contains the same extensions that `language_for_extension` maps to a language.
#[allow(dead_code)]
pub fn supported_extensions() -> &'static [&'static str] {
    const EXTENSIONS: &[&str] = &[
        // Python
        "py", "pyi",
        // TypeScript / JavaScript
        "ts", "tsx", "js", "jsx", "mjs", "cjs",
        // Rust, Go, Java, C#
        "rs", "go", "java", "cs",
        // Kotlin
        "kt", "kts",
        // C
        "c", "h",
        // C++
        "cpp", "cc", "cxx", "c++", "hpp", "hh", "hxx", "h++",
    ];
    EXTENSIONS
}
/// A single import recorded for a parsed file.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct ImportInfo {
/// The imported path or module string as recorded by the parser (for example `"os"`).
pub path: String,
/// `true` for imports built with `ImportInfo::type_only`, `false` for `ImportInfo::runtime`.
pub is_type_only: bool,
}
impl ImportInfo {
pub fn runtime(path: impl Into<String>) -> Self {
Self {
path: path.into(),
is_type_only: false,
}
}
pub fn type_only(path: impl Into<String>) -> Self {
Self {
path: path.into(),
is_type_only: true,
}
}
}
/// Everything extracted from a single source file (or merged from several).
#[derive(Debug, Default, Clone, serde::Serialize, serde::Deserialize)]
pub struct ParseResult {
/// Functions found in the file.
pub functions: Vec<crate::models::Function>,
/// Classes found in the file.
pub classes: Vec<crate::models::Class>,
/// Imports found in the file.
pub imports: Vec<ImportInfo>,
/// Call edges as string pairs.
/// NOTE(review): presumably `(caller qualified name, callee name)` — confirm against the parsers.
pub calls: Vec<(String, String)>,
/// NOTE(review): presumably names of functions whose address is taken (used as values) — confirm.
pub address_taken: std::collections::HashSet<String>,
/// Trait implementation pairs; defaults to empty when absent from serialized data.
/// NOTE(review): presumably `(type name, trait name)` — confirm against the parsers.
#[serde(default)]
pub trait_impls: Vec<(String, String)>,
/// Raw values from value extraction; set only when parsing with value extraction enabled.
/// Never serialized.
#[serde(skip)]
pub raw_values: Option<crate::values::store::RawParseValues>,
}
impl ParseResult {
pub fn new() -> Self {
Self::default()
}
pub fn merge(&mut self, other: ParseResult) {
self.functions.extend(other.functions);
self.classes.extend(other.classes);
self.imports.extend(other.imports);
self.calls.extend(other.calls);
self.address_taken.extend(other.address_taken);
self.trait_impls.extend(other.trait_impls);
if let Some(other_raw) = other.raw_values {
if let Some(ref mut self_raw) = self.raw_values {
self_raw.module_constants.extend(other_raw.module_constants);
for (k, v) in other_raw.function_assignments {
self_raw.function_assignments.entry(k).or_default().extend(v);
}
for (k, v) in other_raw.return_expressions {
self_raw.return_expressions.insert(k, v);
}
} else {
self.raw_values = Some(other_raw);
}
}
}
pub fn is_empty(&self) -> bool {
self.functions.is_empty()
&& self.classes.is_empty()
&& self.imports.is_empty()
&& self.calls.is_empty()
}
pub fn entity_count(&self) -> usize {
self.functions.len() + self.classes.len()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
// Smoke test: calls `parse_file` on the relative path `test.py` and discards the result,
// so it only checks that the call does not panic.
#[test]
fn test_parse_python_file() {
let path = PathBuf::from("test.py");
let _ = parse_file(&path);
}
// A path with an unrecognised extension must yield `Ok` with an empty result, not an error.
#[test]
fn test_unknown_extension_returns_empty() {
let path = PathBuf::from("test.unknown");
let result = parse_file(&path).expect("should parse unknown extension");
assert!(result.is_empty());
}
// `ParseResult::merge` must append the other result's functions, classes, imports, calls
// and trait impls to the receiver.
#[test]
fn test_parse_result_merge() {
use crate::models::{Class, Function};
// Receiver: one function and one import.
let mut result1 = ParseResult {
functions: vec![Function {
name: "func1".to_string(),
qualified_name: "test::func1:1".to_string(),
file_path: PathBuf::from("test.py"),
line_start: 1,
line_end: 5,
parameters: vec![],
return_type: None,
is_async: false,
complexity: None,
max_nesting: None,
doc_comment: None,
annotations: vec![],
}],
classes: vec![],
imports: vec![ImportInfo::runtime("os")],
calls: vec![],
address_taken: std::collections::HashSet::new(),
trait_impls: vec![],
raw_values: None,
};
// Argument: one function, one class, one import, one call and one trait impl.
let result2 = ParseResult {
functions: vec![Function {
name: "func2".to_string(),
qualified_name: "test::func2:10".to_string(),
file_path: PathBuf::from("test.py"),
line_start: 10,
line_end: 15,
parameters: vec![],
return_type: None,
is_async: true,
complexity: None,
max_nesting: None,
doc_comment: None,
annotations: vec![],
}],
classes: vec![Class {
name: "MyClass".to_string(),
qualified_name: "test::MyClass:20".to_string(),
file_path: PathBuf::from("test.py"),
line_start: 20,
line_end: 30,
methods: vec![],
field_count: 0,
bases: vec![],
doc_comment: None,
annotations: vec![],
}],
imports: vec![ImportInfo::runtime("sys")],
calls: vec![("test::func1:1".to_string(), "func2".to_string())],
address_taken: std::collections::HashSet::new(),
trait_impls: vec![("MyStruct".to_string(), "MyTrait".to_string())],
raw_values: None,
};
result1.merge(result2);
assert_eq!(result1.functions.len(), 2);
assert_eq!(result1.classes.len(), 1);
assert_eq!(result1.imports.len(), 2);
assert_eq!(result1.calls.len(), 1);
assert_eq!(result1.trait_impls.len(), 1);
assert_eq!(result1.trait_impls[0].0, "MyStruct");
assert_eq!(result1.trait_impls[0].1, "MyTrait");
// Two functions plus one class.
assert_eq!(result1.entity_count(), 3);
}
// Spot-checks one extension per language plus an unknown extension.
#[test]
fn test_language_for_extension() {
assert_eq!(language_for_extension("py"), Some("Python"));
assert_eq!(language_for_extension("ts"), Some("TypeScript"));
assert_eq!(language_for_extension("rs"), Some("Rust"));
assert_eq!(language_for_extension("go"), Some("Go"));
assert_eq!(language_for_extension("java"), Some("Java"));
assert_eq!(language_for_extension("cs"), Some("C#"));
assert_eq!(language_for_extension("kt"), Some("Kotlin"));
assert_eq!(language_for_extension("c"), Some("C"));
assert_eq!(language_for_extension("cpp"), Some("C++"));
assert_eq!(language_for_extension("unknown"), None);
}
// A `.h` file containing C++ markers (`namespace`, `class`) should be routed to the C++
// parser, which is expected to report the class.
#[test]
fn test_header_dispatch_cpp_heuristic() {
let dir = tempfile::tempdir().expect("should create temp dir");
let hdr = dir.path().join("test.h");
std::fs::write(
&hdr,
r#"
namespace demo {
class Widget { public: int x; };
}
"#,
)
.expect("should write test header");
let result = parse_file(&hdr).expect("should parse C++ header");
assert_eq!(result.classes.len(), 1);
}
// A `.h` file without any C++ marker falls back to the C parser; a bare prototype is
// expected to produce neither functions nor classes.
#[test]
fn test_header_dispatch_c_fallback() {
let dir = tempfile::tempdir().expect("should create temp dir");
let hdr = dir.path().join("test.h");
std::fs::write(
&hdr,
r#"
#ifndef TEST_H
#define TEST_H
int add(int a, int b);
#endif
"#,
)
.expect("should write test header");
let result = parse_file(&hdr).expect("should parse C header");
assert!(result.functions.is_empty());
assert!(result.classes.is_empty());
}
// Files above the 2 MiB guardrail must be skipped and reported as an empty result.
#[test]
fn test_parse_file_skips_very_large_files() {
let dir = tempfile::tempdir().expect("should create temp dir");
let big = dir.path().join("big.py");
// Each repetition is 6 bytes, so the payload ends up slightly larger than 2 MiB.
let payload = "x = 1\n".repeat((2 * 1024 * 1024 / 6) + 1024);
std::fs::write(&big, payload).expect("should write large file");
let result = parse_file(&big).expect("should handle large file");
assert!(result.is_empty());
}
// Spot-checks that one extension per language is present in `supported_extensions()`.
#[test]
fn test_supported_extensions() {
let exts = supported_extensions();
assert!(exts.contains(&"py"));
assert!(exts.contains(&"ts"));
assert!(exts.contains(&"rs"));
assert!(exts.contains(&"go"));
assert!(exts.contains(&"java"));
assert!(exts.contains(&"cs"));
assert!(exts.contains(&"kt"));
assert!(exts.contains(&"c"));
assert!(exts.contains(&"cpp"));
}
// `parse_file_with_values` on a Python file must populate `raw_values`, including at
// least one module-level constant.
#[test]
fn test_parse_python_extracts_values() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("test.py");
std::fs::write(
&file,
"TIMEOUT = 3600\n\ndef foo():\n    x = \"hello\"\n    return x\n",
)
.unwrap();
let result = parse_file_with_values(&file).unwrap();
let raw = result
.raw_values
.as_ref()
.expect("raw_values should be populated");
assert!(
!raw.module_constants.is_empty(),
"should extract module constant TIMEOUT"
);
}
// Same as the Python value-extraction test, but for a TypeScript source file.
#[test]
fn test_parse_typescript_extracts_values() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("test.ts");
std::fs::write(
&file,
"const MAX = 100;\nfunction foo() { return MAX; }\n",
)
.unwrap();
let result = parse_file_with_values(&file).unwrap();
let raw = result
.raw_values
.as_ref()
.expect("raw_values should be populated for TS");
assert!(
!raw.module_constants.is_empty(),
"should extract TS constant"
);
}
// An existing file with an unsupported extension parses successfully and carries no raw values.
#[test]
fn test_parse_unsupported_extension_no_values() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("test.xyz");
std::fs::write(&file, "x = 1").unwrap();
let result = parse_file(&file).unwrap();
assert!(
result.raw_values.is_none(),
"unsupported extension should have no values"
);
}
// Plain `parse_file` must not run value extraction, even for a supported language.
#[test]
fn test_parse_file_skips_value_extraction() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("test.py");
std::fs::write(&file, "TIMEOUT = 3600\n").unwrap();
let result = parse_file(&file).unwrap();
assert!(
result.raw_values.is_none(),
"parse_file() should skip value extraction for streaming perf"
);
}
}