mod javascript_parser;
mod python_parser;
mod rust_parser;
mod typescript_parser;
use std::collections::{HashSet, VecDeque};
use std::path::Path;
use seshat_core::{FunctionCall, Language, ProjectFile};
use sha2::{Digest, Sha256};
use tree_sitter::Node;
use crate::ScanError;
use javascript_parser::JavaScriptParser;
use python_parser::PythonParser;
use rust_parser::RustParser;
use seshat_core::ir::DependencyUsage;
use typescript_parser::TypeScriptParser;
/// A language-specific source parser.
///
/// Implementations turn the text of a single source file into a
/// [`ProjectFile`]; `parse_file` selects the implementation from the
/// file's [`Language`].
pub trait Parser {
/// Parses `source`, the contents of the file identified by `path`.
///
/// Returns the resulting [`ProjectFile`], or a [`ScanError`] when the
/// source could not be parsed.
fn parse(&self, path: &Path, source: &str) -> Result<ProjectFile, ScanError>;
}
/// Returns the text that `node` covers inside `source`.
///
/// Falls back to an empty string when the node's text cannot be decoded as
/// UTF-8.
pub(super) fn node_text<'a>(node: &Node, source: &'a [u8]) -> &'a str {
    match node.utf8_text(source) {
        Ok(text) => text,
        Err(_) => "",
    }
}
pub(super) fn find_child_node<'a>(node: &'a Node, kind: &str) -> Option<Node<'a>> {
(0..node.child_count())
.filter_map(|i| node.child(i as u32))
.find(|c| c.kind() == kind)
}
/// Returns the source text of the first direct child of `node` with the given
/// `kind`, as an owned string, or `None` when there is no such child.
pub(super) fn find_child_text(node: &Node, kind: &str, source: &[u8]) -> Option<String> {
    let child = find_child_node(node, kind)?;
    Some(node_text(&child, source).to_string())
}
/// Reports whether `node` has at least one direct child of the given `kind`.
pub(super) fn has_child_kind(node: &Node, kind: &str) -> bool {
    (0..node.child_count())
        .filter_map(|index| node.child(index as u32))
        .any(|child| child.kind() == kind)
}
/// Upper bound on the number of calls `collect_calls_bfs` accumulates in its
/// output vector before it stops walking the tree.
pub(crate) const MAX_FUNCTION_CALLS_PER_FILE: usize = 500;
/// Number of context lines a call snippet includes before the call's first line.
pub(crate) const CALL_SNIPPET_LINES_BEFORE: usize = 2;
/// Number of context lines a call snippet includes after the call's last line.
pub(crate) const CALL_SNIPPET_LINES_AFTER: usize = 4;
/// Hard cap on the total number of lines in a single call snippet.
pub(crate) const CALL_SNIPPET_MAX_LINES: usize = 30;
/// Builds a context snippet around a call from pre-split source lines.
///
/// `line` and `end_line` are 1-based and inclusive; values past the end of the
/// file are clamped to the last line, and an `end_line` before `line` is
/// treated as `line`. The snippet starts `CALL_SNIPPET_LINES_BEFORE` lines
/// before the call, ends `CALL_SNIPPET_LINES_AFTER` lines after it, and never
/// exceeds `CALL_SNIPPET_MAX_LINES` lines. Lines are joined with `\n`.
///
/// Returns an empty string when there are no lines or either line number is 0.
pub fn build_call_snippet_from_lines(
    source_lines: &[&str],
    line: usize,
    end_line: usize,
) -> String {
    let Some(last_index) = source_lines.len().checked_sub(1) else {
        return String::new();
    };
    if line == 0 || end_line == 0 {
        return String::new();
    }

    // Convert to 0-based indices, clamped into the file.
    let first_call_idx = last_index.min(line - 1);
    let last_call_idx = last_index.min(end_line - 1).max(first_call_idx);

    let from = first_call_idx.saturating_sub(CALL_SNIPPET_LINES_BEFORE);
    // Exclusive end: bounded by the file length and by the snippet size cap.
    let to = (last_call_idx + CALL_SNIPPET_LINES_AFTER + 1)
        .min(source_lines.len())
        .min(from + CALL_SNIPPET_MAX_LINES);

    source_lines[from..to].join("\n")
}
/// Builds a context snippet around a call located at the 1-based, inclusive
/// line range `line..=end_line` of `source`.
///
/// Splits `source` into lines and delegates to
/// `build_call_snippet_from_lines`.
pub fn build_call_snippet(source: &str, line: usize, end_line: usize) -> String {
    let source_lines = source.lines().collect::<Vec<&str>>();
    build_call_snippet_from_lines(source_lines.as_slice(), line, end_line)
}
/// Walks the descendants of `root` breadth-first and appends the calls found
/// to `out`.
///
/// * `call_kind` — node kind that identifies a call expression.
/// * `skip_kinds` — node kinds whose whole subtree is ignored.
/// * `extract_fn` — receives the call node, the full source and the source
///   split into lines; returns the call to record, or `None` to drop it.
///
/// Calls are de-duplicated by callee (only the first one seen is kept), nodes
/// deeper than 60 levels below `root`'s children are ignored, and the walk
/// stops once `out` holds `MAX_FUNCTION_CALLS_PER_FILE` entries.
pub fn collect_calls_bfs<F>(
    root: &tree_sitter::Node,
    source: &str,
    call_kind: &str,
    skip_kinds: &[&str],
    extract_fn: F,
    out: &mut Vec<FunctionCall>,
) where
    F: Fn(&tree_sitter::Node, &str, &[&str]) -> Option<FunctionCall>,
{
    const MAX_DEPTH: usize = 60;

    let source_lines: Vec<&str> = source.lines().collect();
    let mut seen_callees: HashSet<String> = HashSet::new();

    // Seed the queue with the direct children of the root at depth 0.
    let mut pending: VecDeque<(tree_sitter::Node, usize)> = (0..root.child_count())
        .filter_map(|index| root.child(index as u32))
        .map(|child| (child, 0))
        .collect();

    while let Some((current, level)) = pending.pop_front() {
        if out.len() >= MAX_FUNCTION_CALLS_PER_FILE {
            break;
        }
        let kind = current.kind();
        if level > MAX_DEPTH || skip_kinds.contains(&kind) {
            continue;
        }
        if kind == call_kind {
            if let Some(call) = extract_fn(&current, source, &source_lines) {
                // Record only the first call seen for each callee.
                if seen_callees.insert(call.callee.clone()) {
                    out.push(call);
                }
            }
        }
        pending.extend(
            (0..current.child_count())
                .filter_map(|index| current.child(index as u32))
                .map(|child| (child, level + 1)),
        );
    }
}
/// Collects the `///` doc comment that immediately precedes `node`.
///
/// Walks backwards over the preceding siblings: `///` line comments contribute
/// one trimmed line each, attribute items are skipped, and anything else
/// (including a non-`///` line comment) ends the walk. The lines are returned
/// in source order joined by `\n`, or `None` when no doc line was found.
pub(super) fn collect_rust_doc_comment(node: &Node, source: &[u8]) -> Option<String> {
    let mut doc_lines: Vec<String> = Vec::new();
    let mut cursor = node.prev_sibling();

    while let Some(sibling) = cursor {
        let kind = sibling.kind();
        if kind == "line_comment" {
            let Some(doc) = node_text(&sibling, source).strip_prefix("///") else {
                break;
            };
            doc_lines.push(doc.trim().to_owned());
        } else if kind != "attribute_item" {
            break;
        }
        cursor = sibling.prev_sibling();
    }

    if doc_lines.is_empty() {
        return None;
    }
    // Lines were gathered bottom-up; restore source order.
    doc_lines.reverse();
    Some(doc_lines.join("\n"))
}
/// Extracts the file-level doc comment of a JavaScript/TypeScript file.
///
/// The comment must be the first child of `root`, optionally preceded by
/// `hash_bang_line` nodes. Returns the cleaned comment text, or `None` when
/// the file does not start with a comment or the cleaned text is empty.
pub(super) fn extract_js_ts_file_doc(root: &Node, source: &[u8]) -> Option<String> {
    for index in 0..root.child_count() {
        let child = root.child(index as u32)?;
        match child.kind() {
            "comment" => {
                let doc = clean_js_comment(node_text(&child, source));
                return Some(doc).filter(|text| !text.is_empty());
            }
            // A shebang line may precede the file comment; keep looking.
            "hash_bang_line" => {}
            _ => return None,
        }
    }
    None
}
/// Returns the cleaned text of the comment that is the previous named sibling
/// of `node`.
///
/// Yields `None` when there is no previous named sibling, when it is not a
/// `comment` node, or when the cleaned text is empty.
pub(super) fn collect_js_doc_comment(node: &Node, source: &[u8]) -> Option<String> {
    node.prev_named_sibling()
        .filter(|prev| prev.kind() == "comment")
        .map(|prev| clean_js_comment(node_text(&prev, source)))
        .filter(|doc| !doc.is_empty())
}
/// Strips comment markers from a JavaScript/TypeScript comment.
///
/// * Block comments (`/* ... */`, `/** ... */`): the delimiters are removed,
///   each line is trimmed and stripped of leading `*` decoration, empty lines
///   are dropped, and the remaining lines are joined with single spaces.
/// * Line comments (`// ...`): the `//` marker is removed and the rest trimmed.
/// * Anything else is returned trimmed but otherwise unchanged.
///
/// Empty block comments such as `/**/` yield an empty string.
pub(super) fn clean_js_comment(raw: &str) -> String {
    let s = raw.trim();

    // Strip the delimiters one after the other so the opening and closing
    // markers can never overlap. Slicing by fixed offsets would panic on
    // inputs like `/**/` or `/*/`, where the two markers share characters.
    let block_body = s
        .strip_prefix("/*")
        .and_then(|rest| rest.strip_suffix("*/"));
    if let Some(body) = block_body {
        // A JSDoc opener (`/**`) carries one extra star that belongs to the
        // delimiter, not to the text.
        let body = body.strip_prefix('*').unwrap_or(body);
        return body
            .lines()
            .map(|l| l.trim().trim_start_matches('*').trim())
            .filter(|l| !l.is_empty())
            .collect::<Vec<_>>()
            .join(" ");
    }

    if let Some(rest) = s.strip_prefix("//") {
        return rest.trim().to_owned();
    }
    s.to_owned()
}
/// Extracts the docstring of a Python block.
///
/// The docstring is the first named child of `block` when that child is an
/// `expression_statement` whose first named child is a `string`. Returns the
/// cleaned string contents, or `None` when the block does not start that way.
pub(super) fn extract_python_docstring(block: &Node, source: &[u8]) -> Option<String> {
    let statement = block
        .named_child(0)
        .filter(|stmt| stmt.kind() == "expression_statement")?;
    let literal = statement
        .named_child(0)
        .filter(|expr| expr.kind() == "string")?;
    Some(clean_python_docstring(node_text(&literal, source)))
}
/// Removes the quoting from a Python string literal used as a docstring.
///
/// Surrounding whitespace is trimmed, then the first matching pair of
/// delimiters (`"""`, `'''`, `"`, `'`, tried in that order) is removed and the
/// contents are trimmed. A single `r`/`R`/`u`/`U` string prefix directly in
/// front of the opening quote (as in `r"""..."""`) is dropped as well.
///
/// Text that is not wrapped in matching quotes is returned trimmed but
/// otherwise unchanged.
fn clean_python_docstring(raw: &str) -> String {
    let trimmed = raw.trim();

    // Only treat the first character as a string prefix when a quote follows
    // immediately; otherwise plain text such as `rust` would lose its `r`.
    let literal = trimmed
        .strip_prefix(|c: char| matches!(c, 'r' | 'R' | 'u' | 'U'))
        .filter(|rest| rest.starts_with('"') || rest.starts_with('\''))
        .unwrap_or(trimmed);

    // Triple-quoted forms must be tried before the single-quoted ones.
    for &delim in &["\"\"\"", "'''", "\"", "'"] {
        // Stripping the suffix from what is left after the prefix guarantees
        // the two delimiters do not overlap.
        let inner = literal
            .strip_prefix(delim)
            .and_then(|rest| rest.strip_suffix(delim));
        if let Some(inner) = inner {
            return inner.trim().to_owned();
        }
    }
    trimmed.to_owned()
}
pub(super) fn extract_string_value(node: &Node, source: &[u8]) -> Option<String> {
let string_node = find_child_node(node, "string")?;
let fragment = find_child_node(&string_node, "string_fragment")?;
Some(node_text(&fragment, source).to_string())
}
/// Lists the names bound by a JavaScript/TypeScript import clause.
///
/// * a default import contributes its identifier;
/// * each specifier of a named import contributes the imported (pre-`as`) name;
/// * a namespace import contributes `* as <alias>`, or `*` when no alias
///   identifier is present.
///
/// Names are returned in source order.
pub(super) fn extract_import_names(clause: &Node, source: &[u8]) -> Vec<String> {
    let mut names = Vec::new();
    for i in 0..clause.child_count() {
        let Some(child) = clause.child(i as u32) else {
            continue;
        };
        match child.kind() {
            "identifier" => {
                names.push(node_text(&child, source).to_string());
            }
            "named_imports" => {
                for j in 0..child.child_count() {
                    let Some(spec) = child.child(j as u32) else {
                        continue;
                    };
                    if spec.kind() != "import_specifier" {
                        continue;
                    }
                    // Prefer the grammar's `name` field: for TypeScript inline
                    // type-only specifiers (`import { type Foo }`) the first
                    // child is the `type` keyword, not the imported name.
                    // Fall back to the first child when the field is absent.
                    let name_node = spec
                        .child_by_field_name("name")
                        .or_else(|| spec.child(0));
                    if let Some(name_node) = name_node {
                        names.push(node_text(&name_node, source).to_string());
                    }
                }
            }
            "namespace_import" => {
                let entry = match find_child_text(&child, "identifier", source) {
                    Some(alias) => format!("* as {alias}"),
                    None => "*".to_string(),
                };
                names.push(entry);
            }
            _ => {}
        }
    }
    names
}
#[allow(clippy::too_many_arguments)]
pub(super) fn extract_exported_lexical(
node: &Node,
source: &[u8],
exports: &mut Vec<seshat_core::Export>,
functions: &mut Vec<seshat_core::Function>,
is_default: bool,
line: usize,
end_line: usize,
) {
for i in 0..(node.child_count()) {
let Some(child) = node.child(i as u32) else {
continue;
};
if child.kind() == "variable_declarator" {
let name = find_child_text(&child, "identifier", source).unwrap_or_default();
let func_node = find_arrow_or_function_expr(&child);
let is_func = func_node.is_some();
if is_func {
let is_async = child_has_async_value(&child, source);
let parameters = func_node
.map(|n| extract_js_ts_parameters(&n, source))
.unwrap_or_default();
functions.push(seshat_core::Function {
name: name.clone(),
is_public: true,
is_async,
line: child.start_position().row + 1,
end_line: child.end_position().row + 1,
parameters,
doc_comment: None,
});
}
if !name.is_empty() {
exports.push(seshat_core::Export {
name,
is_default,
is_type_only: false,
line,
end_line,
});
}
}
}
}
/// Builds a [`seshat_core::Function`] from a JavaScript/TypeScript function
/// declaration node.
///
/// The function is recorded as non-public with no doc comment; its name is
/// empty when the node has no `identifier` child. Line numbers are 1-based.
pub(super) fn extract_function_declaration(node: &Node, source: &[u8]) -> seshat_core::Function {
    let first_row = node.start_position().row;
    let last_row = node.end_position().row;

    seshat_core::Function {
        name: find_child_text(node, "identifier", source).unwrap_or_default(),
        is_public: false,
        is_async: has_child_kind(node, "async"),
        line: first_row + 1,
        end_line: last_row + 1,
        parameters: extract_js_ts_parameters(node, source),
        doc_comment: None,
    }
}
pub(super) fn child_has_async_value(declarator: &Node, source: &[u8]) -> bool {
for i in 0..(declarator.child_count()) {
if let Some(child) = declarator.child(i as u32) {
if child.kind() == "arrow_function" || child.kind() == "function_expression" {
return has_child_kind(&child, "async");
}
}
}
node_text(declarator, source).contains("async")
}
pub(super) fn find_arrow_or_function_expr<'a>(declarator: &'a Node) -> Option<Node<'a>> {
for i in 0..(declarator.child_count()) {
if let Some(child) = declarator.child(i as u32) {
match child.kind() {
"arrow_function" | "function_expression" => return Some(child),
_ => {}
}
}
}
None
}
/// Lists the parameter names of a JavaScript/TypeScript function node.
///
/// Reads the function's `formal_parameters` child and collects, in order:
/// plain identifiers, the identifier of required/optional parameters and rest
/// patterns, and the left-hand identifier of default-valued parameters.
/// Parameters without a plain identifier at those positions are skipped.
/// Returns an empty list when the node has no `formal_parameters` child.
pub(super) fn extract_js_ts_parameters(func_node: &Node, source: &[u8]) -> Vec<String> {
    let Some(param_list) = find_child_node(func_node, "formal_parameters") else {
        return Vec::new();
    };

    let mut names = Vec::new();
    for index in 0..param_list.child_count() {
        let Some(param) = param_list.child(index as u32) else {
            continue;
        };
        let candidate = match param.kind() {
            "identifier" => Some(node_text(&param, source).to_string()),
            "required_parameter" | "optional_parameter" | "rest_pattern" => {
                find_child_text(&param, "identifier", source)
            }
            // `name = default`: the name is the first child when it is an identifier.
            "assignment_pattern" => param
                .child(0)
                .filter(|target| target.kind() == "identifier")
                .map(|target| node_text(&target, source).to_string()),
            _ => None,
        };
        names.extend(candidate.filter(|name| !name.is_empty()));
    }
    names
}
/// Returns the SHA-256 digest of `source` as a lowercase hexadecimal string
/// (two characters per digest byte).
pub fn content_hash(source: &str) -> String {
    use std::fmt::Write;

    let digest = Sha256::digest(source.as_bytes());
    let mut hex = String::with_capacity(digest.len() * 2);
    for byte in digest.iter() {
        // Writing into a `String` cannot fail, so the result is ignored.
        let _ = write!(hex, "{byte:02x}");
    }
    hex
}
/// Reads the file at `abs_path` and parses it as `language`.
///
/// `stored_path` is the path passed on to the parser. Dependencies whose
/// package name appears in `local_packages` are removed from the result.
///
/// Returns the parsed file together with the source text that was read.
///
/// # Errors
///
/// Returns the I/O error when the file cannot be read as a UTF-8 string.
pub fn read_and_parse_file(
    abs_path: &Path,
    stored_path: &Path,
    language: Language,
    local_packages: &[String],
) -> std::io::Result<(ProjectFile, String)> {
    let source = std::fs::read_to_string(abs_path)?;
    let mut parsed = parse_file(stored_path, &source, language);

    if local_packages.is_empty() {
        return Ok((parsed, source));
    }

    parsed
        .dependencies_used
        .retain(|dep| !local_packages.contains(&dep.package));
    Ok((parsed, source))
}
pub fn parse_file(path: &Path, source: &str, language: Language) -> ProjectFile {
let parser: &dyn Parser = match language {
Language::Rust => &RustParser,
Language::TypeScript => &TypeScriptParser,
Language::JavaScript => &JavaScriptParser,
Language::Python => &PythonParser,
};
let hash = content_hash(source);
match parser.parse(path, source) {
Ok(mut pf) => {
pf.content_hash = hash;
pf
}
Err(e) => {
tracing::warn!(path = %path.display(), error = %e, "Parser failed; returning empty IR");
empty_project_file(path, language, hash)
}
}
}
fn empty_project_file(path: &Path, language: Language, hash: String) -> ProjectFile {
use seshat_core::*;
let language_ir = match language {
Language::Rust => LanguageIR::Rust(RustIR::default()),
Language::TypeScript => LanguageIR::TypeScript(TypeScriptIR::default()),
Language::JavaScript => LanguageIR::JavaScript(JavaScriptIR::default()),
Language::Python => LanguageIR::Python(PythonIR::default()),
};
ProjectFile {
path: path.to_path_buf(),
language,
content_hash: hash,
imports: Vec::new(),
exports: Vec::new(),
functions: Vec::new(),
types: Vec::new(),
dependencies_used: Vec::new(),
language_ir,
file_doc: None,
}
}
/// Reports whether a Rust import path does not refer to an external crate.
///
/// Only the first `::`-separated segment is inspected; it counts as built-in
/// when it is `std`, `core`, `alloc`, `proc_macro`, `test`, or one of the
/// path keywords `self`, `super`, `crate`.
pub(super) fn is_rust_builtin(module: &str) -> bool {
    const BUILTIN_ROOTS: [&str; 8] = [
        "std",
        "core",
        "alloc",
        "proc_macro",
        "test",
        "self",
        "super",
        "crate",
    ];

    let root = module.split_once("::").map_or(module, |(head, _)| head);
    BUILTIN_ROOTS.contains(&root)
}
/// Reports whether a Python import does not refer to a third-party package.
///
/// Relative imports (starting with `.`) always qualify. Otherwise the first
/// dot-separated segment is looked up in a fixed list of module names (mostly
/// standard-library modules, plus `typing_extensions`).
pub(super) fn is_python_stdlib_or_relative(module: &str) -> bool {
    const KNOWN_ROOTS: &[&str] = &[
        "os",
        "sys",
        "re",
        "json",
        "math",
        "io",
        "abc",
        "ast",
        "copy",
        "datetime",
        "enum",
        "functools",
        "itertools",
        "logging",
        "pathlib",
        "typing",
        "collections",
        "dataclasses",
        "contextlib",
        "subprocess",
        "threading",
        "asyncio",
        "time",
        "hashlib",
        "hmac",
        "base64",
        "urllib",
        "http",
        "email",
        "csv",
        "sqlite3",
        "unittest",
        "tempfile",
        "shutil",
        "glob",
        "inspect",
        "traceback",
        "warnings",
        "weakref",
        "gc",
        "struct",
        "socket",
        "ssl",
        "uuid",
        "string",
        "textwrap",
        "random",
        "secrets",
        "decimal",
        "fractions",
        "statistics",
        "pprint",
        "builtins",
        "__future__",
        "typing_extensions",
        "types",
        "operator",
        "argparse",
        "configparser",
        "xml",
        "zipfile",
        "tarfile",
        "pickle",
        "shelve",
        "queue",
        "shlex",
        "platform",
        "multiprocessing",
        "concurrent",
        "signal",
        "fnmatch",
        "difflib",
        "dis",
        "compileall",
        "runpy",
        "importlib",
        "pkgutil",
        "ctypes",
        "array",
        "bisect",
        "heapq",
        "pdb",
        "profile",
        "cProfile",
        "timeit",
        "doctest",
        "getopt",
        "getpass",
        "curses",
        "readline",
        "rlcompleter",
        "zipimport",
        "zlib",
        "gzip",
        "bz2",
        "lzma",
    ];

    if module.starts_with('.') {
        return true;
    }
    let root = match module.find('.') {
        Some(dot) => &module[..dot],
        None => module,
    };
    KNOWN_ROOTS.contains(&root)
}
/// Reports whether a JavaScript/TypeScript module specifier does not refer to
/// an external package.
///
/// This covers relative imports (`./x`, `../x`, and the bare directory
/// specifiers `.` and `..`), the `@/` and `~/` path-alias prefixes, `node:`
/// built-in modules, and `#`-prefixed specifiers.
pub(super) fn is_ts_js_builtin(module: &str) -> bool {
    const NON_PACKAGE_PREFIXES: [&str; 6] = ["./", "../", "@/", "~/", "node:", "#"];

    // `import x from '.'` / `'..'` resolve to a directory index and carry no
    // trailing slash, so the prefix checks alone would miss them.
    matches!(module, "." | "..")
        || NON_PACKAGE_PREFIXES
            .iter()
            .any(|&prefix| module.starts_with(prefix))
}
/// Derives the package name from a JavaScript/TypeScript module specifier.
///
/// Scoped specifiers keep their first two segments (`@scope/pkg/sub` becomes
/// `@scope/pkg`); a scope without a package segment is returned unchanged.
/// Unscoped specifiers keep only the segment before the first `/`.
pub(super) fn ts_package_name(module: &str) -> String {
    match module.strip_prefix('@') {
        Some(scoped) => {
            let mut parts = scoped.splitn(3, '/');
            match (parts.next(), parts.next()) {
                (Some(scope), Some(package)) => format!("@{scope}/{package}"),
                _ => format!("@{scoped}"),
            }
        }
        None => module.split('/').next().unwrap_or(module).to_owned(),
    }
}
/// Converts a Rust import path into a dependency usage.
///
/// Returns `None` for built-in roots (see `is_rust_builtin`). Otherwise the
/// package is the first `::`-separated segment of `module`, and `module`
/// itself is kept as the import path.
pub(super) fn rust_dep_from_import(module: &str, line: usize) -> Option<DependencyUsage> {
    (!is_rust_builtin(module)).then(|| DependencyUsage {
        package: module.split("::").next().unwrap_or(module).to_owned(),
        import_path: module.to_owned(),
        line,
    })
}
/// Converts a Python module path into a dependency usage.
///
/// Returns `None` for relative imports and known standard-library modules
/// (see `is_python_stdlib_or_relative`). Otherwise the package is the first
/// dot-separated segment of `module`, and `module` itself is kept as the
/// import path.
pub(super) fn python_dep_from_import(module: &str, line: usize) -> Option<DependencyUsage> {
    (!is_python_stdlib_or_relative(module)).then(|| DependencyUsage {
        package: module.split('.').next().unwrap_or(module).to_owned(),
        import_path: module.to_owned(),
        line,
    })
}
/// Converts a JavaScript/TypeScript module specifier into a dependency usage.
///
/// Returns `None` for specifiers that `is_ts_js_builtin` classifies as
/// non-package imports. Otherwise the package is derived with
/// `ts_package_name`, and `module` itself is kept as the import path.
pub(super) fn ts_dep_from_import(module: &str, line: usize) -> Option<DependencyUsage> {
    (!is_ts_js_builtin(module)).then(|| DependencyUsage {
        package: ts_package_name(module),
        import_path: module.to_owned(),
        line,
    })
}
// Unit tests for the shared parser helpers: content hashing, parser dispatch
// by language, and the per-language dependency filters.
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
// Hashing the same input twice yields the same non-empty digest.
#[test]
fn content_hash_deterministic() {
let a = content_hash("hello world");
let b = content_hash("hello world");
assert_eq!(a, b);
assert!(!a.is_empty());
}
// Different inputs produce different digests.
#[test]
fn content_hash_differs_for_different_input() {
let a = content_hash("hello");
let b = content_hash("world");
assert_ne!(a, b);
}
// The digest is rendered as 64 hexadecimal characters (32 bytes).
#[test]
fn content_hash_is_sha256_hex() {
let h = content_hash("hello world");
assert_eq!(h.len(), 64);
assert!(h.chars().all(|c| c.is_ascii_hexdigit()));
}
// `Language::Rust` produces a Rust IR and keeps the given path.
#[test]
fn dispatch_selects_rust_parser() {
let path = PathBuf::from("src/main.rs");
let pf = parse_file(&path, "fn main() {}", Language::Rust);
assert_eq!(pf.language, Language::Rust);
assert_eq!(pf.path, path);
assert!(!pf.content_hash.is_empty());
assert!(matches!(pf.language_ir, seshat_core::LanguageIR::Rust(_)));
}
// `Language::TypeScript` produces a TypeScript IR.
#[test]
fn dispatch_selects_typescript_parser() {
let path = PathBuf::from("src/index.ts");
let pf = parse_file(&path, "export const x = 1;", Language::TypeScript);
assert_eq!(pf.language, Language::TypeScript);
assert!(matches!(
pf.language_ir,
seshat_core::LanguageIR::TypeScript(_)
));
}
// `Language::JavaScript` produces a JavaScript IR.
#[test]
fn dispatch_selects_javascript_parser() {
let path = PathBuf::from("src/index.js");
let pf = parse_file(&path, "const x = 1;", Language::JavaScript);
assert_eq!(pf.language, Language::JavaScript);
assert!(matches!(
pf.language_ir,
seshat_core::LanguageIR::JavaScript(_)
));
}
// `Language::Python` produces a Python IR.
#[test]
fn dispatch_selects_python_parser() {
let path = PathBuf::from("src/main.py");
let pf = parse_file(&path, "def main(): pass", Language::Python);
assert_eq!(pf.language, Language::Python);
assert!(matches!(pf.language_ir, seshat_core::LanguageIR::Python(_)));
}
// Standard crates and path keywords are built-in; third-party crates are not.
#[test]
fn rust_builtin_filter() {
assert!(is_rust_builtin("std"));
assert!(is_rust_builtin("std::io"));
assert!(is_rust_builtin("core::fmt"));
assert!(is_rust_builtin("alloc::vec"));
assert!(is_rust_builtin("crate::foo"));
assert!(is_rust_builtin("super::bar"));
assert!(is_rust_builtin("self::baz"));
assert!(!is_rust_builtin("reqwest"));
assert!(!is_rust_builtin("serde::Serialize"));
assert!(!is_rust_builtin("tokio::runtime"));
}
// Stdlib modules and relative imports are filtered; third-party packages are not.
#[test]
fn python_builtin_filter() {
assert!(is_python_stdlib_or_relative("os"));
assert!(is_python_stdlib_or_relative("sys"));
assert!(is_python_stdlib_or_relative("typing"));
assert!(is_python_stdlib_or_relative(".relative"));
assert!(is_python_stdlib_or_relative("..parent"));
assert!(!is_python_stdlib_or_relative("requests"));
assert!(!is_python_stdlib_or_relative("fastapi"));
assert!(!is_python_stdlib_or_relative("pydantic"));
}
// Sub-paths are dropped; scoped packages keep `@scope/name`.
#[test]
fn ts_package_name_extraction() {
assert_eq!(ts_package_name("react"), "react");
assert_eq!(ts_package_name("react/hooks"), "react");
assert_eq!(ts_package_name("@angular/core"), "@angular/core");
assert_eq!(ts_package_name("@angular/core/testing"), "@angular/core");
}
// Relative, aliased, `node:` and `#` specifiers are filtered; packages are not.
#[test]
fn ts_builtin_filter() {
assert!(is_ts_js_builtin("./local"));
assert!(is_ts_js_builtin("../parent"));
assert!(is_ts_js_builtin("@/alias"));
assert!(is_ts_js_builtin("~/home"));
assert!(is_ts_js_builtin("node:fs"));
assert!(is_ts_js_builtin("#internal"));
assert!(!is_ts_js_builtin("react"));
assert!(!is_ts_js_builtin("@angular/core"));
assert!(!is_ts_js_builtin("axios"));
}
// `parse_file` stamps the result with the hash computed by `content_hash`.
#[test]
fn content_hash_computed_in_shared_code() {
let source = "fn main() {}";
let expected_hash = content_hash(source);
let pf = parse_file(Path::new("test.rs"), source, Language::Rust);
assert_eq!(pf.content_hash, expected_hash);
}
// Every language variant yields a file tagged with that language and a hash.
#[test]
fn all_language_variants_dispatched() {
let languages = [
Language::Rust,
Language::TypeScript,
Language::JavaScript,
Language::Python,
];
for lang in languages {
let pf = parse_file(Path::new("test"), "source", lang);
assert_eq!(pf.language, lang);
assert!(!pf.content_hash.is_empty());
}
}
}