graphify_extract/ast_extract/
mod.rs1mod c_cpp;
15mod csharp;
16mod generic;
17mod go;
18mod java;
19mod js_ts;
20mod kotlin;
21mod python;
22mod ruby;
23mod rust;
24
25use std::collections::HashMap;
26use std::path::Path;
27use std::sync::LazyLock;
28
29use graphify_core::confidence::Confidence;
30use graphify_core::id::make_id;
31use graphify_core::model::{ExtractionResult, GraphEdge, GraphNode, NodeType};
32use regex::Regex;
33
/// Declares a `pub(crate)` static named `$name` holding a [`Regex`] built from
/// `$pattern`.
///
/// The regex is wrapped in a [`LazyLock`], so it is compiled on first access
/// rather than at program start. If `$pattern` fails to compile, that first
/// access panics and the pattern text itself is used as the panic message.
macro_rules! re {
    ($name:ident, $pattern:expr) => {
        pub(crate) static $name: LazyLock<Regex> =
            LazyLock::new(|| Regex::new($pattern).expect($pattern));
    };
}
40
41re!(RE_PY_CLASS, r"(?m)^(\s*)class\s+(\w+)");
42re!(RE_PY_CLASS_LOOKUP, r"^(\s*)class\s+(\w+)");
43re!(RE_PY_FUNC, r"(?m)^(\s*)def\s+(\w+)\s*\(");
44re!(
45 RE_PY_IMPORT,
46 r"(?m)^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s*]+)"
47);
48
49re!(
50 RE_JS_CLASS,
51 r"(?m)(?:export\s+)?(?:default\s+)?class\s+(\w+)"
52);
53re!(
54 RE_JS_FUNC,
55 r"(?m)(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+(\w+)\s*\(|(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[^=])\s*=>"
56);
57re!(
58 RE_JS_IMPORT,
59 r#"(?m)import\s+(?:\{([^}]+)\}|(\w+))\s+from\s+['"]([^'"]+)['"]|import\s+['"]([^'"]+)['"]"#
60);
61re!(
62 RE_JS_REQUIRE,
63 r#"(?m)(?:const|let|var)\s+(\w+)\s*=\s*require\s*\(\s*['"]([^'"]+)['"]\s*\)"#
64);
65
66re!(
67 RE_RS_STRUCT,
68 r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?struct\s+(\w+)"
69);
70re!(RE_RS_ENUM, r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?enum\s+(\w+)");
71re!(
72 RE_RS_TRAIT,
73 r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?trait\s+(\w+)"
74);
75re!(
76 RE_RS_IMPL,
77 r"(?m)^(?:\s*)impl(?:<[^>]*>)?\s+(?:(\w+)\s+for\s+)?(\w+)"
78);
79re!(
80 RE_RS_FUNC,
81 r"(?m)^(\s*)(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?(?:unsafe\s+)?(?:const\s+)?fn\s+(\w+)"
82);
83re!(RE_RS_USE, r"(?m)^(?:\s*)(?:pub\s+)?use\s+([\w:]+)");
84
// ---- Go --------------------------------------------------------------------

// `type Name struct` / `type Name interface` at column 0.
// Group 1 = type name, group 2 = the keyword (`struct` or `interface`).
re!(RE_GO_TYPE, r"(?m)^type\s+(\w+)\s+(struct|interface)");
// `func name(` at column 0; an optional parenthesised receiver before the
// name is skipped. Group 1 = function or method name.
re!(RE_GO_FUNC, r"(?m)^func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(");
// Single-line `import "path"` at column 0. Group 1 = import path.
re!(RE_GO_IMPORT_SINGLE, r#"(?m)^import\s+"([^"]+)""#);
// `import ( ... )` block. Group 1 = everything between the parentheses.
re!(RE_GO_IMPORT_BLOCK, r"(?s)import\s*\(([^)]+)\)");
// Any double-quoted string; group 1 = its contents.
// NOTE(review): presumably run over the body captured by RE_GO_IMPORT_BLOCK —
// confirm in the Go extractor.
re!(RE_GO_IMPORT_LINE, r#""([^"]+)""#);
90
// ---- Java ------------------------------------------------------------------

// `class` / `interface` / `enum` declaration with optional access modifier and
// any number of `abstract` / `static` / `final` modifiers.
// Group 1 = declaration keyword, group 2 = type name.
// Not anchored, so nested (indented) declarations match too.
re!(
    RE_JAVA_CLASS,
    r"(?m)(?:public\s+|private\s+|protected\s+)?(?:abstract\s+|static\s+|final\s+)*(class|interface|enum)\s+(\w+)"
);
// Indented method header: optional modifiers, one return-type word (with
// optional `<...>` type arguments), then the name followed by `(`.
// Group 1 = method name.
// NOTE(review): the return-type slot is just `\w+`, so statements such as
// `return foo(` or `new Foo(` on an indented line also fit this pattern.
re!(
    RE_JAVA_METHOD,
    r"(?m)^\s+(?:public\s+|private\s+|protected\s+)?(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?(?:abstract\s+)?(?:\w+(?:<[^>]*>)?)\s+(\w+)\s*\("
);
// `import a.b.C;` / `import static a.b.C;` at column 0.
// Group 1 = dotted path (a trailing `.*` wildcard is not part of `[\w.]+`).
re!(RE_JAVA_IMPORT, r"(?m)^import\s+(?:static\s+)?([\w.]+)\s*;");
100
// ---- C / C++ ---------------------------------------------------------------

// `#include <header>` or `#include "header"` at column 0.
// Group 1 = header name without the delimiters.
re!(RE_C_INCLUDE, r#"(?m)^#include\s+[<"]([^>"]+)[>"]"#);
// `class` / `struct` / `namespace` declaration, optionally indented.
// Group 1 = declared name.
re!(
    RE_CPP_CLASS,
    r"(?m)^(?:\s*)(?:class|struct|namespace)\s+(\w+)"
);
// `struct Tag` or `typedef struct Tag` at column 0. Group 1 = struct tag.
re!(RE_C_STRUCT, r"(?m)^(?:typedef\s+)?struct\s+(\w+)");
// Function definition starting at column 0: optional storage/qualifier
// keywords, a single type word followed by whitespace or `*`, the function
// name, a parameter list containing no `;`, and an opening `{`.
// Group 1 = function name. Requiring `{` and excluding `;` keeps prototypes
// (which end in `;`) from matching.
re!(
    RE_C_FUNC,
    r"(?m)^(?:static\s+)?(?:inline\s+)?(?:extern\s+)?(?:const\s+)?(?:unsigned\s+)?(?:signed\s+)?(?:\w+(?:\s*\*\s*|\s+))(\w+)\s*\([^;]*\)\s*\{"
);
111
// ---- Ruby ------------------------------------------------------------------

// `class Name` / `module Name`, optionally indented.
// Group 1 = keyword (`class` or `module`), group 2 = name, which may be
// `::`-qualified (e.g. `Outer::Inner`).
re!(RE_RB_CLASS, r"(?m)^\s*(class|module)\s+(\w+(?:::\w+)*)");
// `def name` / `def self.name`, optionally indented.
// Group 1 = `self.` when present, group 2 = method name including an optional
// trailing `?`, `!` or `=`.
re!(RE_RB_FUNC, r"(?m)^\s*def\s+(self\.)?(\w+[?!=]?)");
// `require '...'` / `require_relative '...'` with a quoted argument.
// Group 1 = the argument without quotes.
re!(
    RE_RB_REQUIRE,
    r#"(?m)^\s*require(?:_relative)?\s+['"]([^'"]+)['"]"#
);
118
// ---- C# --------------------------------------------------------------------

// `class` / `interface` / `struct` / `enum` declaration with optional access
// modifier and any number of `abstract` / `static` / `sealed` / `partial`
// modifiers. Group 1 = declaration keyword, group 2 = type name.
// Not anchored, so nested (indented) declarations match too.
re!(
    RE_CS_CLASS,
    r"(?m)(?:public\s+|private\s+|protected\s+|internal\s+)?(?:abstract\s+|static\s+|sealed\s+|partial\s+)*(class|interface|struct|enum)\s+(\w+)"
);
// Indented method header: optional modifiers, one return-type word (with
// optional `<...>` type arguments), then the name followed by `(`.
// Group 1 = method name.
// NOTE(review): the return-type slot is just `\w+`, so statements such as
// `return Foo(` or `new Foo(` on an indented line also fit this pattern.
re!(
    RE_CS_METHOD,
    r"(?m)^\s+(?:public\s+|private\s+|protected\s+|internal\s+)?(?:static\s+)?(?:virtual\s+)?(?:override\s+)?(?:async\s+)?(?:\w+(?:<[^>]*>)?)\s+(\w+)\s*\("
);
// `using Name.Space;` at column 0. Group 1 = dotted namespace.
re!(RE_CS_USING, r"(?m)^using\s+([\w.]+)\s*;");
128
// ---- Kotlin ----------------------------------------------------------------

// `class` / `object` / `interface` declaration with at most one of the
// `open` / `abstract` / `data` / `sealed` modifiers. Group 1 = declared name.
// Not anchored, so it can match anywhere on a line.
re!(
    RE_KT_CLASS,
    r"(?m)(?:open\s+|abstract\s+|data\s+|sealed\s+)?(?:class|object|interface)\s+(\w+)"
);
// `fun name(` with any number of leading modifiers and optional type
// parameters (`fun <T> name(`). Group 1 = function name.
re!(
    RE_KT_FUNC,
    r"(?m)^\s*(?:(?:private|public|protected|internal|override|open|suspend)\s+)*fun\s+(?:<[^>]+>\s+)?(\w+)\s*\("
);
// `import a.b.C` at column 0. Group 1 = dotted path.
re!(RE_KT_IMPORT, r"(?m)^import\s+([\w.]+)");
138
// ---- Generic fallback ------------------------------------------------------
// Language-agnostic patterns built from keywords shared by many languages.

// Type-like declaration: any number of leading modifier words, then one of
// `class|struct|module|object|interface|trait|protocol|enum|defmodule`.
// Group 1 = declared name, which may be `::`-qualified.
re!(
    RE_GEN_CLASS,
    r"(?m)^\s*(?:(?:pub|public|private|protected|internal|open|abstract|sealed|partial|static|final|export)\s+)*(?:class|struct|module|object|interface|trait|protocol|enum|defmodule)\s+(\w+(?:::\w+)*)"
);
// Function-like declaration: any number of leading modifier words, then one of
// `func|function|fn|def|defp|fun|sub`, the name, and an opening `(` or `<`.
// Group 1 = function name including an optional trailing `?` or `!`.
re!(
    RE_GEN_FUNC,
    r"(?m)^\s*(?:(?:pub|public|private|protected|internal|open|override|suspend|static|async|export|def|defp)\s+)*(?:func|function|fn|def|defp|fun|sub)\s+(\w+[?!]?)\s*[\(<]"
);
// Import-like statement: one of `import|use|using|require|include|from`
// followed by an optionally quoted target.
// Group 1 = target made of word characters, `.`, `/`, `:` and `-`.
re!(
    RE_GEN_IMPORT,
    r#"(?m)^\s*(?:import|use|using|require|include|from)\s+['"]?([\w./:-]+)['"]?"#
);
151
152pub fn extract_file(path: &Path, source: &str, lang: &str) -> ExtractionResult {
154 match lang {
155 "python" => python::extract_python(path, source),
156 "javascript" | "typescript" => js_ts::extract_js_ts(path, source, lang),
157 "rust" => rust::extract_rust(path, source),
158 "go" => go::extract_go(path, source),
159 "java" => java::extract_java(path, source),
160 "c" | "cpp" => c_cpp::extract_c_cpp(path, source, lang),
161 "ruby" => ruby::extract_ruby(path, source),
162 "csharp" => csharp::extract_csharp(path, source),
163 "kotlin" => kotlin::extract_kotlin(path, source),
164 _ => generic::extract_generic(path, source, lang),
165 }
166}
167
/// Returns the file name of `path` without its final extension.
///
/// Falls back to the literal `"unknown"` when the path has no file name
/// (e.g. it is empty, `/` or `..`) or the stem is not valid UTF-8.
pub(crate) fn file_stem(path: &Path) -> String {
    match path.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) => stem.to_owned(),
        None => String::from("unknown"),
    }
}
174
/// Renders `path` as an owned `String`.
///
/// The conversion is lossy: any non-UTF-8 sequence in the path is replaced
/// with U+FFFD instead of causing a failure.
pub(crate) fn path_str(path: &Path) -> String {
    String::from(path.to_string_lossy())
}
178
179pub(crate) fn make_file_node(path: &Path) -> GraphNode {
180 let ps = path_str(path);
181 GraphNode {
182 id: make_id(&[&ps]),
183 label: file_stem(path),
184 source_file: ps,
185 source_location: None,
186 node_type: NodeType::File,
187 community: None,
188 extra: HashMap::new(),
189 }
190}
191
192pub(crate) fn make_node(name: &str, path: &Path, node_type: NodeType, line: usize) -> GraphNode {
193 let ps = path_str(path);
194 GraphNode {
195 id: make_id(&[&ps, name]),
196 label: name.to_string(),
197 source_file: ps,
198 source_location: Some(format!("L{line}")),
199 node_type,
200 community: None,
201 extra: HashMap::new(),
202 }
203}
204
205pub(crate) fn make_edge(
206 source_id: &str,
207 target_id: &str,
208 relation: &str,
209 path: &Path,
210 confidence: Confidence,
211) -> GraphEdge {
212 GraphEdge {
213 source: source_id.to_string(),
214 target: target_id.to_string(),
215 relation: relation.to_string(),
216 confidence: confidence.clone(),
217 confidence_score: confidence.default_score(),
218 source_file: path_str(path),
219 source_location: None,
220 weight: 1.0,
221 extra: HashMap::new(),
222 }
223}
224
225pub(crate) fn line_of(source: &str, cap: ®ex::Captures<'_>) -> usize {
227 source[..cap.get(0).unwrap().start()].lines().count() + 1
228}
229
230pub(crate) fn end_line_at(source: &str, next: Option<®ex::Captures<'_>>) -> usize {
233 match next {
234 Some(n) => source[..n.get(0).unwrap().start()].lines().count(),
235 None => source.lines().count(),
236 }
237}
238
239pub(crate) fn full_match<'a>(cap: ®ex::Captures<'a>) -> &'a str {
241 cap.get(0).unwrap().as_str()
242}
243
244pub(crate) fn infer_calls(
247 functions: &[(String, String, usize, usize)], source_lines: &[&str],
249 path: &Path,
250) -> Vec<GraphEdge> {
251 let mut edges = Vec::new();
252 for (_caller_name, caller_id, start, end) in functions {
253 let body = source_lines
254 .get(*start..*end)
255 .unwrap_or_default()
256 .join("\n");
257 for (callee_name, callee_id, _, _) in functions {
258 if caller_id == callee_id {
259 continue;
260 }
261 let pattern = format!(r"\b{}\s*\(", regex::escape(callee_name));
262 if let Ok(re) = regex::Regex::new(&pattern) {
263 for mat in re.find_iter(&body) {
264 let start = mat.start();
265 if start > 0 && body.as_bytes()[start - 1] == b'.' {
266 continue;
267 }
268 edges.push(make_edge(
269 caller_id,
270 callee_id,
271 "calls",
272 path,
273 Confidence::Inferred,
274 ));
275 break;
276 }
277 }
278 }
279 }
280 edges
281}