Skip to main content

graphify_extract/ast_extract/
mod.rs

//! Regex-based AST extraction engine.
//!
//! This module implements a **working** regex-based extractor for each supported
//! language. It serves as the "Pass 1" deterministic extraction while tree-sitter
//! grammar crates are being added to the workspace.
//!
//! For each source file the extractor produces:
//! - A **file** node
//! - **Class / struct / trait / interface** nodes
//! - **Function / method** nodes with `defines` edges from their parent
//! - **Import** nodes with `imports` edges from the file
//! - **Calls** edges inferred by matching known function names within bodies
13
14mod c_cpp;
15mod csharp;
16mod generic;
17mod go;
18mod java;
19mod js_ts;
20mod kotlin;
21mod python;
22mod ruby;
23mod rust;
24
25use std::collections::HashMap;
26use std::path::Path;
27use std::sync::LazyLock;
28
29use graphify_core::confidence::Confidence;
30use graphify_core::id::make_id;
31use graphify_core::model::{ExtractionResult, GraphEdge, GraphNode, NodeType};
32use regex::Regex;
33
/// Declares a crate-visible, lazily compiled [`Regex`] static.
///
/// `re!(NAME, pattern)` expands to `pub(crate) static NAME: LazyLock<Regex>`. The pattern
/// is compiled the first time the static is dereferenced; if it does not compile, that
/// first access panics and the pattern text is used as the panic message.
macro_rules! re {
    ($ident:ident, $regex_src:expr) => {
        pub(crate) static $ident: LazyLock<Regex> = LazyLock::new(|| {
            let compiled = Regex::new($regex_src);
            compiled.expect($regex_src)
        });
    };
}
40
41re!(RE_PY_CLASS, r"(?m)^(\s*)class\s+(\w+)");
42re!(RE_PY_CLASS_LOOKUP, r"^(\s*)class\s+(\w+)");
43re!(RE_PY_FUNC, r"(?m)^(\s*)def\s+(\w+)\s*\(");
44re!(
45    RE_PY_IMPORT,
46    r"(?m)^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s*]+)"
47);
48
49re!(
50    RE_JS_CLASS,
51    r"(?m)(?:export\s+)?(?:default\s+)?class\s+(\w+)"
52);
53re!(
54    RE_JS_FUNC,
55    r"(?m)(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+(\w+)\s*\(|(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[^=])\s*=>"
56);
57re!(
58    RE_JS_IMPORT,
59    r#"(?m)import\s+(?:\{([^}]+)\}|(\w+))\s+from\s+['"]([^'"]+)['"]|import\s+['"]([^'"]+)['"]"#
60);
61re!(
62    RE_JS_REQUIRE,
63    r#"(?m)(?:const|let|var)\s+(\w+)\s*=\s*require\s*\(\s*['"]([^'"]+)['"]\s*\)"#
64);
65
66re!(
67    RE_RS_STRUCT,
68    r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?struct\s+(\w+)"
69);
70re!(RE_RS_ENUM, r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?enum\s+(\w+)");
71re!(
72    RE_RS_TRAIT,
73    r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?trait\s+(\w+)"
74);
75re!(
76    RE_RS_IMPL,
77    r"(?m)^(?:\s*)impl(?:<[^>]*>)?\s+(?:(\w+)\s+for\s+)?(\w+)"
78);
79re!(
80    RE_RS_FUNC,
81    r"(?m)^(\s*)(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?(?:unsafe\s+)?(?:const\s+)?fn\s+(\w+)"
82);
83re!(RE_RS_USE, r"(?m)^(?:\s*)(?:pub\s+)?use\s+([\w:]+)");
84
85re!(RE_GO_TYPE, r"(?m)^type\s+(\w+)\s+(struct|interface)");
86re!(RE_GO_FUNC, r"(?m)^func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(");
87re!(RE_GO_IMPORT_SINGLE, r#"(?m)^import\s+"([^"]+)""#);
88re!(RE_GO_IMPORT_BLOCK, r"(?s)import\s*\(([^)]+)\)");
89re!(RE_GO_IMPORT_LINE, r#""([^"]+)""#);
90
91re!(
92    RE_JAVA_CLASS,
93    r"(?m)(?:public\s+|private\s+|protected\s+)?(?:abstract\s+|static\s+|final\s+)*(class|interface|enum)\s+(\w+)"
94);
95re!(
96    RE_JAVA_METHOD,
97    r"(?m)^\s+(?:public\s+|private\s+|protected\s+)?(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?(?:abstract\s+)?(?:\w+(?:<[^>]*>)?)\s+(\w+)\s*\("
98);
99re!(RE_JAVA_IMPORT, r"(?m)^import\s+(?:static\s+)?([\w.]+)\s*;");
100
101re!(RE_C_INCLUDE, r#"(?m)^#include\s+[<"]([^>"]+)[>"]"#);
102re!(
103    RE_CPP_CLASS,
104    r"(?m)^(?:\s*)(?:class|struct|namespace)\s+(\w+)"
105);
106re!(RE_C_STRUCT, r"(?m)^(?:typedef\s+)?struct\s+(\w+)");
107re!(
108    RE_C_FUNC,
109    r"(?m)^(?:static\s+)?(?:inline\s+)?(?:extern\s+)?(?:const\s+)?(?:unsigned\s+)?(?:signed\s+)?(?:\w+(?:\s*\*\s*|\s+))(\w+)\s*\([^;]*\)\s*\{"
110);
111
112re!(RE_RB_CLASS, r"(?m)^\s*(class|module)\s+(\w+(?:::\w+)*)");
113re!(RE_RB_FUNC, r"(?m)^\s*def\s+(self\.)?(\w+[?!=]?)");
114re!(
115    RE_RB_REQUIRE,
116    r#"(?m)^\s*require(?:_relative)?\s+['"]([^'"]+)['"]"#
117);
118
119re!(
120    RE_CS_CLASS,
121    r"(?m)(?:public\s+|private\s+|protected\s+|internal\s+)?(?:abstract\s+|static\s+|sealed\s+|partial\s+)*(class|interface|struct|enum)\s+(\w+)"
122);
123re!(
124    RE_CS_METHOD,
125    r"(?m)^\s+(?:public\s+|private\s+|protected\s+|internal\s+)?(?:static\s+)?(?:virtual\s+)?(?:override\s+)?(?:async\s+)?(?:\w+(?:<[^>]*>)?)\s+(\w+)\s*\("
126);
127re!(RE_CS_USING, r"(?m)^using\s+([\w.]+)\s*;");
128
129re!(
130    RE_KT_CLASS,
131    r"(?m)(?:open\s+|abstract\s+|data\s+|sealed\s+)?(?:class|object|interface)\s+(\w+)"
132);
133re!(
134    RE_KT_FUNC,
135    r"(?m)^\s*(?:(?:private|public|protected|internal|override|open|suspend)\s+)*fun\s+(?:<[^>]+>\s+)?(\w+)\s*\("
136);
137re!(RE_KT_IMPORT, r"(?m)^import\s+([\w.]+)");
138
139re!(
140    RE_GEN_CLASS,
141    r"(?m)^\s*(?:(?:pub|public|private|protected|internal|open|abstract|sealed|partial|static|final|export)\s+)*(?:class|struct|module|object|interface|trait|protocol|enum|defmodule)\s+(\w+(?:::\w+)*)"
142);
143re!(
144    RE_GEN_FUNC,
145    r"(?m)^\s*(?:(?:pub|public|private|protected|internal|open|override|suspend|static|async|export|def|defp)\s+)*(?:func|function|fn|def|defp|fun|sub)\s+(\w+[?!]?)\s*[\(<]"
146);
147re!(
148    RE_GEN_IMPORT,
149    r#"(?m)^\s*(?:import|use|using|require|include|from)\s+['"]?([\w./:-]+)['"]?"#
150);
151
152/// Extract graph nodes and edges from a single source file.
153pub fn extract_file(path: &Path, source: &str, lang: &str) -> ExtractionResult {
154    match lang {
155        "python" => python::extract_python(path, source),
156        "javascript" | "typescript" => js_ts::extract_js_ts(path, source, lang),
157        "rust" => rust::extract_rust(path, source),
158        "go" => go::extract_go(path, source),
159        "java" => java::extract_java(path, source),
160        "c" | "cpp" => c_cpp::extract_c_cpp(path, source, lang),
161        "ruby" => ruby::extract_ruby(path, source),
162        "csharp" => csharp::extract_csharp(path, source),
163        "kotlin" => kotlin::extract_kotlin(path, source),
164        _ => generic::extract_generic(path, source, lang),
165    }
166}
167
/// File name of `path` without its final extension, as an owned `String`.
///
/// Returns `"unknown"` when the path has no file name or the stem is not valid UTF-8.
pub(crate) fn file_stem(path: &Path) -> String {
    match path.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) => stem.to_owned(),
        None => String::from("unknown"),
    }
}
174
/// Renders `path` as an owned `String` using a lossy UTF-8 conversion.
pub(crate) fn path_str(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    String::from(lossy)
}
178
179pub(crate) fn make_file_node(path: &Path) -> GraphNode {
180    let ps = path_str(path);
181    GraphNode {
182        id: make_id(&[&ps]),
183        label: file_stem(path),
184        source_file: ps,
185        source_location: None,
186        node_type: NodeType::File,
187        community: None,
188        extra: HashMap::new(),
189    }
190}
191
192pub(crate) fn make_node(name: &str, path: &Path, node_type: NodeType, line: usize) -> GraphNode {
193    let ps = path_str(path);
194    GraphNode {
195        id: make_id(&[&ps, name]),
196        label: name.to_string(),
197        source_file: ps,
198        source_location: Some(format!("L{line}")),
199        node_type,
200        community: None,
201        extra: HashMap::new(),
202    }
203}
204
205pub(crate) fn make_edge(
206    source_id: &str,
207    target_id: &str,
208    relation: &str,
209    path: &Path,
210    confidence: Confidence,
211) -> GraphEdge {
212    GraphEdge {
213        source: source_id.to_string(),
214        target: target_id.to_string(),
215        relation: relation.to_string(),
216        confidence: confidence.clone(),
217        confidence_score: confidence.default_score(),
218        source_file: path_str(path),
219        source_location: None,
220        weight: 1.0,
221        provenance: Some(format!("regex:{relation}")),
222        extra: HashMap::new(),
223    }
224}
225
226/// Line number (1-based) where a regex capture starts in `source`.
227pub(crate) fn line_of(source: &str, cap: &regex::Captures<'_>) -> usize {
228    source[..cap.get(0).unwrap().start()].lines().count() + 1
229}
230
231/// 1-based end line of `source` at byte offset of the next capture's start,
232/// or end of file if this is the last capture.
233pub(crate) fn end_line_at(source: &str, next: Option<&regex::Captures<'_>>) -> usize {
234    match next {
235        Some(n) => source[..n.get(0).unwrap().start()].lines().count(),
236        None => source.lines().count(),
237    }
238}
239
240/// Full matched string of capture group 0.
241pub(crate) fn full_match<'a>(cap: &regex::Captures<'a>) -> &'a str {
242    cap.get(0).unwrap().as_str()
243}
244
245/// Simple call-graph inference: for each function body, look for occurrences
246/// of other known function names.
247pub(crate) fn infer_calls(
248    functions: &[(String, String, usize, usize)], // (name, id, start_line, end_line)
249    source_lines: &[&str],
250    path: &Path,
251) -> Vec<GraphEdge> {
252    let mut edges = Vec::new();
253    for (_caller_name, caller_id, start, end) in functions {
254        let body = source_lines
255            .get(*start..*end)
256            .unwrap_or_default()
257            .join("\n");
258        for (callee_name, callee_id, _, _) in functions {
259            if caller_id == callee_id {
260                continue;
261            }
262            let pattern = format!(r"\b{}\s*\(", regex::escape(callee_name));
263            if let Ok(re) = regex::Regex::new(&pattern) {
264                for mat in re.find_iter(&body) {
265                    let start = mat.start();
266                    if start > 0 && body.as_bytes()[start - 1] == b'.' {
267                        continue;
268                    }
269                    edges.push(make_edge(
270                        caller_id,
271                        callee_id,
272                        "calls",
273                        path,
274                        Confidence::Inferred,
275                    ));
276                    break;
277                }
278            }
279        }
280    }
281    edges
282}