Skip to main content

graphify_extract/
ast_extract.rs

1//! Regex-based AST extraction engine.
2//!
3//! This module implements a **working** regex-based extractor for each supported
4//! language. It serves as the "Pass 1" deterministic extraction while tree-sitter
5//! grammar crates are being added to the workspace.
6//!
7//! For each source file the extractor produces:
8//! - A **file** node
9//! - **Class / struct / trait / interface** nodes
10//! - **Function / method** nodes with `defines` edges from their parent
11//! - **Import** nodes with `imports` edges from the file
12//! - **Calls** edges inferred by matching known function names within bodies
13
14use std::collections::HashMap;
15use std::path::Path;
16
17use graphify_core::confidence::Confidence;
18use graphify_core::id::make_id;
19use graphify_core::model::{ExtractionResult, GraphEdge, GraphNode, NodeType};
20use regex::Regex;
21use tracing::trace;
22
23// ═══════════════════════════════════════════════════════════════════════════
24// Public entry point
25// ═══════════════════════════════════════════════════════════════════════════
26
27/// Extract graph nodes and edges from a single source file.
28pub fn extract_file(path: &Path, source: &str, lang: &str) -> ExtractionResult {
29    match lang {
30        "python" => extract_python(path, source),
31        "javascript" | "typescript" => extract_js_ts(path, source, lang),
32        "rust" => extract_rust(path, source),
33        "go" => extract_go(path, source),
34        "java" => extract_java(path, source),
35        "c" | "cpp" => extract_c_cpp(path, source, lang),
36        "ruby" => extract_ruby(path, source),
37        "csharp" => extract_csharp(path, source),
38        "kotlin" => extract_kotlin(path, source),
39        _ => extract_generic(path, source, lang),
40    }
41}
42
43// ═══════════════════════════════════════════════════════════════════════════
44// Helpers
45// ═══════════════════════════════════════════════════════════════════════════
46
/// Return the file name of `path` without its final extension.
///
/// Falls back to the literal `"unknown"` when the path has no file name or the
/// stem is not valid UTF-8.
fn file_stem(path: &Path) -> String {
    match path.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) => stem.to_owned(),
        None => String::from("unknown"),
    }
}
53
/// Render `path` as an owned `String`, replacing any non-UTF-8 sequences
/// lossily.
fn path_str(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    lossy.into_owned()
}
57
58fn make_file_node(path: &Path) -> GraphNode {
59    let ps = path_str(path);
60    GraphNode {
61        id: make_id(&[&ps]),
62        label: file_stem(path),
63        source_file: ps,
64        source_location: None,
65        node_type: NodeType::File,
66        community: None,
67        extra: HashMap::new(),
68    }
69}
70
71fn make_node(name: &str, path: &Path, node_type: NodeType, line: usize) -> GraphNode {
72    let ps = path_str(path);
73    GraphNode {
74        id: make_id(&[&ps, name]),
75        label: name.to_string(),
76        source_file: ps,
77        source_location: Some(format!("L{line}")),
78        node_type,
79        community: None,
80        extra: HashMap::new(),
81    }
82}
83
84fn make_edge(
85    source_id: &str,
86    target_id: &str,
87    relation: &str,
88    path: &Path,
89    confidence: Confidence,
90) -> GraphEdge {
91    GraphEdge {
92        source: source_id.to_string(),
93        target: target_id.to_string(),
94        relation: relation.to_string(),
95        confidence: confidence.clone(),
96        confidence_score: confidence.default_score(),
97        source_file: path_str(path),
98        source_location: None,
99        weight: 1.0,
100        extra: HashMap::new(),
101    }
102}
103
104/// Simple call-graph inference: for each function body, look for occurrences
105/// of other known function names.
106fn infer_calls(
107    functions: &[(String, String, usize, usize)], // (name, id, start_line, end_line)
108    source_lines: &[&str],
109    path: &Path,
110) -> Vec<GraphEdge> {
111    let mut edges = Vec::new();
112    for (_caller_name, caller_id, start, end) in functions {
113        let body = source_lines
114            .get(*start..*end)
115            .unwrap_or_default()
116            .join("\n");
117        for (callee_name, callee_id, _, _) in functions {
118            if caller_id == callee_id {
119                continue;
120            }
121            // Check if callee_name appears in caller body as a call (name followed by `(`)
122            let pattern = format!(r"\b{}\s*\(", regex::escape(callee_name));
123            if let Ok(re) = Regex::new(&pattern)
124                && re.is_match(&body)
125            {
126                edges.push(make_edge(
127                    caller_id,
128                    callee_id,
129                    "calls",
130                    path,
131                    Confidence::Inferred,
132                ));
133            }
134        }
135    }
136    edges
137}
138
139// ═══════════════════════════════════════════════════════════════════════════
140// Python
141// ═══════════════════════════════════════════════════════════════════════════
142
143fn extract_python(path: &Path, source: &str) -> ExtractionResult {
144    let mut result = ExtractionResult::default();
145    let file_node = make_file_node(path);
146    let file_id = file_node.id.clone();
147    result.nodes.push(file_node);
148
149    let lines: Vec<&str> = source.lines().collect();
150    let ps = path_str(path);
151
152    // Classes: `class Foo(Bar):`  or `class Foo:`
153    let re_class = Regex::new(r"(?m)^(\s*)class\s+(\w+)").unwrap();
154    let re_class_lookup = Regex::new(r"^(\s*)class\s+(\w+)").unwrap();
155    let mut class_ids: HashMap<String, String> = HashMap::new();
156    for cap in re_class.captures_iter(source) {
157        let name = &cap[2];
158        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
159        let node = make_node(name, path, NodeType::Class, line);
160        let node_id = node.id.clone();
161        class_ids.insert(name.to_string(), node_id.clone());
162        result.nodes.push(node);
163        result.edges.push(make_edge(
164            &file_id,
165            &node_id,
166            "defines",
167            path,
168            Confidence::Extracted,
169        ));
170    }
171
172    // Functions / methods: `def foo(...):`
173    let re_func = Regex::new(r"(?m)^(\s*)def\s+(\w+)\s*\(").unwrap();
174    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
175    let func_matches: Vec<_> = re_func.captures_iter(source).collect();
176    for (i, cap) in func_matches.iter().enumerate() {
177        let indent = cap[1].len();
178        let name = cap[2].to_string();
179        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
180
181        let node_type = if indent > 0 {
182            NodeType::Method
183        } else {
184            NodeType::Function
185        };
186        let node = make_node(&name, path, node_type, start_line);
187        let node_id = node.id.clone();
188
189        // Determine parent: if indented, belong to nearest class above with less indent
190        let parent_id = if indent > 0 {
191            // Find enclosing class by checking lines above for `class` with less indent
192            let mut parent = None;
193            for line_idx in (0..start_line.saturating_sub(1)).rev() {
194                if let Some(line) = lines.get(line_idx)
195                    && let Some(cls_cap) = re_class_lookup.captures(line)
196                    && cls_cap[1].len() < indent
197                {
198                    parent = class_ids.get(&cls_cap[2]).cloned();
199                    break;
200                }
201            }
202            parent.unwrap_or_else(|| file_id.clone())
203        } else {
204            file_id.clone()
205        };
206
207        // End line: next function at same or lower indent, or end of file
208        let end_line = if i + 1 < func_matches.len() {
209            source[..func_matches[i + 1].get(0).unwrap().start()]
210                .lines()
211                .count()
212        } else {
213            lines.len()
214        };
215
216        functions.push((name.clone(), node_id.clone(), start_line, end_line));
217        result.nodes.push(node);
218        result.edges.push(make_edge(
219            &parent_id,
220            &node_id,
221            "defines",
222            path,
223            Confidence::Extracted,
224        ));
225    }
226
227    // Imports: `import X` / `from X import Y`
228    let re_import = Regex::new(r"(?m)^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s*]+)").unwrap();
229    for cap in re_import.captures_iter(source) {
230        let module = cap.get(1).map_or("", |m| m.as_str());
231        let names_str = &cap[2];
232        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
233
234        for name in names_str.split(',') {
235            let name = name.trim().split(" as ").next().unwrap_or("").trim();
236            if name.is_empty() || name == "*" {
237                continue;
238            }
239            let full_name = if module.is_empty() {
240                name.to_string()
241            } else {
242                format!("{module}.{name}")
243            };
244            let import_id = make_id(&[&ps, "import", &full_name]);
245            result.nodes.push(GraphNode {
246                id: import_id.clone(),
247                label: full_name,
248                source_file: ps.clone(),
249                source_location: Some(format!("L{line}")),
250                node_type: NodeType::Module,
251                community: None,
252                extra: HashMap::new(),
253            });
254            result.edges.push(make_edge(
255                &file_id,
256                &import_id,
257                "imports",
258                path,
259                Confidence::Extracted,
260            ));
261        }
262    }
263
264    // Infer calls
265    let call_edges = infer_calls(&functions, &lines, path);
266    result.edges.extend(call_edges);
267
268    trace!(
269        "python: {} nodes, {} edges from {}",
270        result.nodes.len(),
271        result.edges.len(),
272        ps
273    );
274    result
275}
276
277// ═══════════════════════════════════════════════════════════════════════════
278// JavaScript / TypeScript
279// ═══════════════════════════════════════════════════════════════════════════
280
281fn extract_js_ts(path: &Path, source: &str, lang: &str) -> ExtractionResult {
282    let mut result = ExtractionResult::default();
283    let file_node = make_file_node(path);
284    let file_id = file_node.id.clone();
285    result.nodes.push(file_node);
286
287    let lines: Vec<&str> = source.lines().collect();
288    let ps = path_str(path);
289
290    // Classes: `class Foo` / `export class Foo`
291    let re_class = Regex::new(r"(?m)(?:export\s+)?(?:default\s+)?class\s+(\w+)").unwrap();
292    for cap in re_class.captures_iter(source) {
293        let name = &cap[1];
294        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
295        let node = make_node(name, path, NodeType::Class, line);
296        let node_id = node.id.clone();
297        result.nodes.push(node);
298        result.edges.push(make_edge(
299            &file_id,
300            &node_id,
301            "defines",
302            path,
303            Confidence::Extracted,
304        ));
305    }
306
307    // Functions: `function foo(` / `const foo = (` / `const foo = async (`
308    // Also: `export function foo(` / `export default function foo(`
309    let re_func = Regex::new(
310        r"(?m)(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+(\w+)\s*\(|(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[^=])\s*=>"
311    )
312    .unwrap();
313    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
314    let func_matches: Vec<_> = re_func.captures_iter(source).collect();
315
316    for (i, cap) in func_matches.iter().enumerate() {
317        let name = cap
318            .get(1)
319            .or(cap.get(2))
320            .map(|m| m.as_str().to_string())
321            .unwrap_or_default();
322        if name.is_empty() {
323            continue;
324        }
325        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
326        let end_line = if i + 1 < func_matches.len() {
327            source[..func_matches[i + 1].get(0).unwrap().start()]
328                .lines()
329                .count()
330        } else {
331            lines.len()
332        };
333
334        let node = make_node(&name, path, NodeType::Function, start_line);
335        let node_id = node.id.clone();
336        functions.push((name, node_id.clone(), start_line, end_line));
337        result.nodes.push(node);
338        result.edges.push(make_edge(
339            &file_id,
340            &node_id,
341            "defines",
342            path,
343            Confidence::Extracted,
344        ));
345    }
346
347    // Imports: `import { X } from 'Y'` / `import X from 'Y'` / `import 'Y'`
348    let re_import = Regex::new(
349        r#"(?m)import\s+(?:\{([^}]+)\}|(\w+))\s+from\s+['"]([^'"]+)['"]|import\s+['"]([^'"]+)['"]"#,
350    )
351    .unwrap();
352    for cap in re_import.captures_iter(source) {
353        let module = cap.get(3).or(cap.get(4)).map(|m| m.as_str()).unwrap_or("");
354        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
355
356        if let Some(names) = cap.get(1) {
357            for name in names.as_str().split(',') {
358                let name = name.trim().split(" as ").next().unwrap_or("").trim();
359                if name.is_empty() {
360                    continue;
361                }
362                let full = format!("{module}/{name}");
363                let import_id = make_id(&[&ps, "import", &full]);
364                result.nodes.push(GraphNode {
365                    id: import_id.clone(),
366                    label: full,
367                    source_file: ps.clone(),
368                    source_location: Some(format!("L{line}")),
369                    node_type: NodeType::Module,
370                    community: None,
371                    extra: HashMap::new(),
372                });
373                result.edges.push(make_edge(
374                    &file_id,
375                    &import_id,
376                    "imports",
377                    path,
378                    Confidence::Extracted,
379                ));
380            }
381        } else if let Some(default_name) = cap.get(2) {
382            let name = default_name.as_str();
383            let import_id = make_id(&[&ps, "import", module]);
384            result.nodes.push(GraphNode {
385                id: import_id.clone(),
386                label: name.to_string(),
387                source_file: ps.clone(),
388                source_location: Some(format!("L{line}")),
389                node_type: NodeType::Module,
390                community: None,
391                extra: HashMap::new(),
392            });
393            result.edges.push(make_edge(
394                &file_id,
395                &import_id,
396                "imports",
397                path,
398                Confidence::Extracted,
399            ));
400        }
401    }
402
403    // Also handle require() for JS
404    if lang == "javascript" {
405        let re_require = Regex::new(
406            r#"(?m)(?:const|let|var)\s+(\w+)\s*=\s*require\s*\(\s*['"]([^'"]+)['"]\s*\)"#,
407        )
408        .unwrap();
409        for cap in re_require.captures_iter(source) {
410            let name = &cap[1];
411            let module = &cap[2];
412            let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
413            let import_id = make_id(&[&ps, "import", module]);
414            result.nodes.push(GraphNode {
415                id: import_id.clone(),
416                label: name.to_string(),
417                source_file: ps.clone(),
418                source_location: Some(format!("L{line}")),
419                node_type: NodeType::Module,
420                community: None,
421                extra: HashMap::new(),
422            });
423            result.edges.push(make_edge(
424                &file_id,
425                &import_id,
426                "imports",
427                path,
428                Confidence::Extracted,
429            ));
430        }
431    }
432
433    let call_edges = infer_calls(&functions, &lines, path);
434    result.edges.extend(call_edges);
435
436    result
437}
438
439// ═══════════════════════════════════════════════════════════════════════════
440// Rust
441// ═══════════════════════════════════════════════════════════════════════════
442
443fn extract_rust(path: &Path, source: &str) -> ExtractionResult {
444    let mut result = ExtractionResult::default();
445    let file_node = make_file_node(path);
446    let file_id = file_node.id.clone();
447    result.nodes.push(file_node);
448
449    let lines: Vec<&str> = source.lines().collect();
450    let ps = path_str(path);
451
452    // Structs: `pub struct Foo` / `struct Foo`
453    let re_struct = Regex::new(r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?struct\s+(\w+)").unwrap();
454    for cap in re_struct.captures_iter(source) {
455        let name = &cap[1];
456        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
457        let node = make_node(name, path, NodeType::Struct, line);
458        let node_id = node.id.clone();
459        result.nodes.push(node);
460        result.edges.push(make_edge(
461            &file_id,
462            &node_id,
463            "defines",
464            path,
465            Confidence::Extracted,
466        ));
467    }
468
469    // Enums: `pub enum Foo` / `enum Foo`
470    let re_enum = Regex::new(r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?enum\s+(\w+)").unwrap();
471    for cap in re_enum.captures_iter(source) {
472        let name = &cap[1];
473        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
474        let node = make_node(name, path, NodeType::Enum, line);
475        let node_id = node.id.clone();
476        result.nodes.push(node);
477        result.edges.push(make_edge(
478            &file_id,
479            &node_id,
480            "defines",
481            path,
482            Confidence::Extracted,
483        ));
484    }
485
486    // Traits: `pub trait Foo` / `trait Foo`
487    let re_trait = Regex::new(r"(?m)^(?:\s*pub(?:\([^)]*\))?\s+)?trait\s+(\w+)").unwrap();
488    for cap in re_trait.captures_iter(source) {
489        let name = &cap[1];
490        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
491        let node = make_node(name, path, NodeType::Trait, line);
492        let node_id = node.id.clone();
493        result.nodes.push(node);
494        result.edges.push(make_edge(
495            &file_id,
496            &node_id,
497            "defines",
498            path,
499            Confidence::Extracted,
500        ));
501    }
502
503    // Impl blocks: `impl Foo` / `impl Trait for Foo`
504    let re_impl = Regex::new(r"(?m)^(?:\s*)impl(?:<[^>]*>)?\s+(?:(\w+)\s+for\s+)?(\w+)").unwrap();
505    for cap in re_impl.captures_iter(source) {
506        let _trait_name = cap.get(1).map(|m| m.as_str());
507        let type_name = &cap[2];
508        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
509        // Create an "implements" edge if impl Trait for Type
510        if let Some(trait_m) = cap.get(1) {
511            let trait_id = make_id(&[&ps, trait_m.as_str()]);
512            let type_id = make_id(&[&ps, type_name]);
513            result.edges.push(make_edge(
514                &type_id,
515                &trait_id,
516                "implements",
517                path,
518                Confidence::Extracted,
519            ));
520        }
521        let _ = line;
522    }
523
524    // Functions: `pub fn foo(` / `fn foo(` / `pub(crate) fn foo(`
525    // Also methods inside impl blocks
526    let re_func = Regex::new(
527        r"(?m)^(\s*)(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?(?:unsafe\s+)?(?:const\s+)?fn\s+(\w+)",
528    )
529    .unwrap();
530    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
531    let func_matches: Vec<_> = re_func.captures_iter(source).collect();
532    for (i, cap) in func_matches.iter().enumerate() {
533        let indent = cap[1].len();
534        let name = cap[2].to_string();
535        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
536        let end_line = if i + 1 < func_matches.len() {
537            source[..func_matches[i + 1].get(0).unwrap().start()]
538                .lines()
539                .count()
540        } else {
541            lines.len()
542        };
543
544        let node_type = if indent > 0 {
545            NodeType::Method
546        } else {
547            NodeType::Function
548        };
549        let node = make_node(&name, path, node_type, start_line);
550        let node_id = node.id.clone();
551        functions.push((name, node_id.clone(), start_line, end_line));
552        result.nodes.push(node);
553        result.edges.push(make_edge(
554            &file_id,
555            &node_id,
556            "defines",
557            path,
558            Confidence::Extracted,
559        ));
560    }
561
562    // Use statements
563    let re_use = Regex::new(r"(?m)^(?:\s*)(?:pub\s+)?use\s+([\w:]+)").unwrap();
564    for cap in re_use.captures_iter(source) {
565        let module = &cap[1];
566        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
567        let import_id = make_id(&[&ps, "use", module]);
568        result.nodes.push(GraphNode {
569            id: import_id.clone(),
570            label: module.to_string(),
571            source_file: ps.clone(),
572            source_location: Some(format!("L{line}")),
573            node_type: NodeType::Module,
574            community: None,
575            extra: HashMap::new(),
576        });
577        result.edges.push(make_edge(
578            &file_id,
579            &import_id,
580            "imports",
581            path,
582            Confidence::Extracted,
583        ));
584    }
585
586    let call_edges = infer_calls(&functions, &lines, path);
587    result.edges.extend(call_edges);
588
589    result
590}
591
592// ═══════════════════════════════════════════════════════════════════════════
593// Go
594// ═══════════════════════════════════════════════════════════════════════════
595
596fn extract_go(path: &Path, source: &str) -> ExtractionResult {
597    let mut result = ExtractionResult::default();
598    let file_node = make_file_node(path);
599    let file_id = file_node.id.clone();
600    result.nodes.push(file_node);
601
602    let lines: Vec<&str> = source.lines().collect();
603    let ps = path_str(path);
604
605    // Type definitions: `type Foo struct {` / `type Foo interface {`
606    let re_type = Regex::new(r"(?m)^type\s+(\w+)\s+(struct|interface)").unwrap();
607    for cap in re_type.captures_iter(source) {
608        let name = &cap[1];
609        let kind = &cap[2];
610        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
611        let node_type = match kind {
612            "interface" => NodeType::Interface,
613            _ => NodeType::Struct,
614        };
615        let node = make_node(name, path, node_type, line);
616        let node_id = node.id.clone();
617        result.nodes.push(node);
618        result.edges.push(make_edge(
619            &file_id,
620            &node_id,
621            "defines",
622            path,
623            Confidence::Extracted,
624        ));
625    }
626
627    // Functions and methods: `func Foo(` / `func (r *Recv) Foo(`
628    let re_func = Regex::new(r"(?m)^func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(").unwrap();
629    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
630    let func_matches: Vec<_> = re_func.captures_iter(source).collect();
631    for (i, cap) in func_matches.iter().enumerate() {
632        let name = cap[1].to_string();
633        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
634        let end_line = if i + 1 < func_matches.len() {
635            source[..func_matches[i + 1].get(0).unwrap().start()]
636                .lines()
637                .count()
638        } else {
639            lines.len()
640        };
641
642        // Methods have a receiver
643        let full_match = cap.get(0).unwrap().as_str();
644        let node_type = if full_match.contains('(') && full_match.find('(') < full_match.find(&name)
645        {
646            NodeType::Method
647        } else {
648            NodeType::Function
649        };
650
651        let node = make_node(&name, path, node_type, start_line);
652        let node_id = node.id.clone();
653        functions.push((name, node_id.clone(), start_line, end_line));
654        result.nodes.push(node);
655        result.edges.push(make_edge(
656            &file_id,
657            &node_id,
658            "defines",
659            path,
660            Confidence::Extracted,
661        ));
662    }
663
664    // Imports: `import "fmt"` / `import ( "fmt" "os" )`
665    let re_import_single = Regex::new(r#"(?m)^import\s+"([^"]+)""#).unwrap();
666    let re_import_block = Regex::new(r"(?s)import\s*\(([^)]+)\)").unwrap();
667    let re_import_line = Regex::new(r#""([^"]+)""#).unwrap();
668
669    for cap in re_import_single.captures_iter(source) {
670        let module = &cap[1];
671        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
672        let import_id = make_id(&[&ps, "import", module]);
673        result.nodes.push(GraphNode {
674            id: import_id.clone(),
675            label: module.to_string(),
676            source_file: ps.clone(),
677            source_location: Some(format!("L{line}")),
678            node_type: NodeType::Package,
679            community: None,
680            extra: HashMap::new(),
681        });
682        result.edges.push(make_edge(
683            &file_id,
684            &import_id,
685            "imports",
686            path,
687            Confidence::Extracted,
688        ));
689    }
690
691    for cap in re_import_block.captures_iter(source) {
692        let block = &cap[1];
693        let block_start = source[..cap.get(0).unwrap().start()].lines().count() + 1;
694        for (idx, imp_cap) in re_import_line.captures_iter(block).enumerate() {
695            let module = &imp_cap[1];
696            let import_id = make_id(&[&ps, "import", module]);
697            result.nodes.push(GraphNode {
698                id: import_id.clone(),
699                label: module.to_string(),
700                source_file: ps.clone(),
701                source_location: Some(format!("L{}", block_start + idx + 1)),
702                node_type: NodeType::Package,
703                community: None,
704                extra: HashMap::new(),
705            });
706            result.edges.push(make_edge(
707                &file_id,
708                &import_id,
709                "imports",
710                path,
711                Confidence::Extracted,
712            ));
713        }
714    }
715
716    let call_edges = infer_calls(&functions, &lines, path);
717    result.edges.extend(call_edges);
718
719    result
720}
721
722// ═══════════════════════════════════════════════════════════════════════════
723// Java
724// ═══════════════════════════════════════════════════════════════════════════
725
726fn extract_java(path: &Path, source: &str) -> ExtractionResult {
727    let mut result = ExtractionResult::default();
728    let file_node = make_file_node(path);
729    let file_id = file_node.id.clone();
730    result.nodes.push(file_node);
731
732    let lines: Vec<&str> = source.lines().collect();
733    let ps = path_str(path);
734
735    // Classes / interfaces / enums
736    let re_class = Regex::new(
737        r"(?m)(?:public\s+|private\s+|protected\s+)?(?:abstract\s+|static\s+|final\s+)*(class|interface|enum)\s+(\w+)",
738    )
739    .unwrap();
740    for cap in re_class.captures_iter(source) {
741        let kind = &cap[1];
742        let name = &cap[2];
743        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
744        let node_type = match kind {
745            "interface" => NodeType::Interface,
746            "enum" => NodeType::Enum,
747            _ => NodeType::Class,
748        };
749        let node = make_node(name, path, node_type, line);
750        let node_id = node.id.clone();
751        result.nodes.push(node);
752        result.edges.push(make_edge(
753            &file_id,
754            &node_id,
755            "defines",
756            path,
757            Confidence::Extracted,
758        ));
759    }
760
761    // Methods: `public void foo(` / `private static int bar(`
762    let re_method = Regex::new(
763        r"(?m)^\s+(?:public\s+|private\s+|protected\s+)?(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?(?:abstract\s+)?(?:\w+(?:<[^>]*>)?)\s+(\w+)\s*\(",
764    )
765    .unwrap();
766    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
767    let func_matches: Vec<_> = re_method.captures_iter(source).collect();
768    for (i, cap) in func_matches.iter().enumerate() {
769        let name = cap[1].to_string();
770        // Skip common false positives
771        if [
772            "if", "for", "while", "switch", "catch", "return", "new", "throw",
773        ]
774        .contains(&name.as_str())
775        {
776            continue;
777        }
778        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
779        let end_line = if i + 1 < func_matches.len() {
780            source[..func_matches[i + 1].get(0).unwrap().start()]
781                .lines()
782                .count()
783        } else {
784            lines.len()
785        };
786
787        let node = make_node(&name, path, NodeType::Method, start_line);
788        let node_id = node.id.clone();
789        functions.push((name, node_id.clone(), start_line, end_line));
790        result.nodes.push(node);
791        result.edges.push(make_edge(
792            &file_id,
793            &node_id,
794            "defines",
795            path,
796            Confidence::Extracted,
797        ));
798    }
799
800    // Imports
801    let re_import = Regex::new(r"(?m)^import\s+(?:static\s+)?([\w.]+)\s*;").unwrap();
802    for cap in re_import.captures_iter(source) {
803        let module = &cap[1];
804        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
805        let import_id = make_id(&[&ps, "import", module]);
806        result.nodes.push(GraphNode {
807            id: import_id.clone(),
808            label: module.to_string(),
809            source_file: ps.clone(),
810            source_location: Some(format!("L{line}")),
811            node_type: NodeType::Package,
812            community: None,
813            extra: HashMap::new(),
814        });
815        result.edges.push(make_edge(
816            &file_id,
817            &import_id,
818            "imports",
819            path,
820            Confidence::Extracted,
821        ));
822    }
823
824    let call_edges = infer_calls(&functions, &lines, path);
825    result.edges.extend(call_edges);
826
827    result
828}
829
830// ═══════════════════════════════════════════════════════════════════════════
831// C / C++
832// ═══════════════════════════════════════════════════════════════════════════
833
834fn extract_c_cpp(path: &Path, source: &str, lang: &str) -> ExtractionResult {
835    let mut result = ExtractionResult::default();
836    let file_node = make_file_node(path);
837    let file_id = file_node.id.clone();
838    result.nodes.push(file_node);
839
840    let lines: Vec<&str> = source.lines().collect();
841    let ps = path_str(path);
842
843    // #include directives
844    let re_include = Regex::new(r#"(?m)^#include\s+[<"]([^>"]+)[>"]"#).unwrap();
845    for cap in re_include.captures_iter(source) {
846        let header = &cap[1];
847        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
848        let import_id = make_id(&[&ps, "include", header]);
849        result.nodes.push(GraphNode {
850            id: import_id.clone(),
851            label: header.to_string(),
852            source_file: ps.clone(),
853            source_location: Some(format!("L{line}")),
854            node_type: NodeType::Module,
855            community: None,
856            extra: HashMap::new(),
857        });
858        result.edges.push(make_edge(
859            &file_id,
860            &import_id,
861            "includes",
862            path,
863            Confidence::Extracted,
864        ));
865    }
866
867    // C++ classes / structs / namespaces
868    if lang == "cpp" {
869        let re_class = Regex::new(r"(?m)^(?:\s*)(?:class|struct|namespace)\s+(\w+)").unwrap();
870        for cap in re_class.captures_iter(source) {
871            let name = &cap[1];
872            let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
873            let node = make_node(name, path, NodeType::Class, line);
874            let node_id = node.id.clone();
875            result.nodes.push(node);
876            result.edges.push(make_edge(
877                &file_id,
878                &node_id,
879                "defines",
880                path,
881                Confidence::Extracted,
882            ));
883        }
884    }
885
886    // C structs
887    if lang == "c" {
888        let re_struct = Regex::new(r"(?m)^(?:typedef\s+)?struct\s+(\w+)").unwrap();
889        for cap in re_struct.captures_iter(source) {
890            let name = &cap[1];
891            let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
892            let node = make_node(name, path, NodeType::Struct, line);
893            let node_id = node.id.clone();
894            result.nodes.push(node);
895            result.edges.push(make_edge(
896                &file_id,
897                &node_id,
898                "defines",
899                path,
900                Confidence::Extracted,
901            ));
902        }
903    }
904
905    // Functions: `type name(` at start of line (heuristic)
906    let re_func = Regex::new(
907        r"(?m)^(?:static\s+)?(?:inline\s+)?(?:extern\s+)?(?:const\s+)?(?:unsigned\s+)?(?:signed\s+)?(?:\w+(?:\s*\*\s*|\s+))(\w+)\s*\([^;]*\)\s*\{",
908    )
909    .unwrap();
910    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
911    let func_matches: Vec<_> = re_func.captures_iter(source).collect();
912    for (i, cap) in func_matches.iter().enumerate() {
913        let name = cap[1].to_string();
914        if ["if", "for", "while", "switch", "return", "sizeof"].contains(&name.as_str()) {
915            continue;
916        }
917        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
918        let end_line = if i + 1 < func_matches.len() {
919            source[..func_matches[i + 1].get(0).unwrap().start()]
920                .lines()
921                .count()
922        } else {
923            lines.len()
924        };
925
926        let node = make_node(&name, path, NodeType::Function, start_line);
927        let node_id = node.id.clone();
928        functions.push((name, node_id.clone(), start_line, end_line));
929        result.nodes.push(node);
930        result.edges.push(make_edge(
931            &file_id,
932            &node_id,
933            "defines",
934            path,
935            Confidence::Extracted,
936        ));
937    }
938
939    let call_edges = infer_calls(&functions, &lines, path);
940    result.edges.extend(call_edges);
941
942    result
943}
944
945// ═══════════════════════════════════════════════════════════════════════════
946// Ruby
947// ═══════════════════════════════════════════════════════════════════════════
948
949fn extract_ruby(path: &Path, source: &str) -> ExtractionResult {
950    let mut result = ExtractionResult::default();
951    let file_node = make_file_node(path);
952    let file_id = file_node.id.clone();
953    result.nodes.push(file_node);
954
955    let lines: Vec<&str> = source.lines().collect();
956    let ps = path_str(path);
957
958    // Classes and modules
959    let re_class = Regex::new(r"(?m)^\s*(class|module)\s+(\w+(?:::\w+)*)").unwrap();
960    for cap in re_class.captures_iter(source) {
961        let name = &cap[2];
962        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
963        let node = make_node(name, path, NodeType::Class, line);
964        let node_id = node.id.clone();
965        result.nodes.push(node);
966        result.edges.push(make_edge(
967            &file_id,
968            &node_id,
969            "defines",
970            path,
971            Confidence::Extracted,
972        ));
973    }
974
975    // Methods
976    let re_func = Regex::new(r"(?m)^\s*def\s+(self\.)?(\w+[?!=]?)").unwrap();
977    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
978    let func_matches: Vec<_> = re_func.captures_iter(source).collect();
979    for (i, cap) in func_matches.iter().enumerate() {
980        let name = cap[2].to_string();
981        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
982        let end_line = if i + 1 < func_matches.len() {
983            source[..func_matches[i + 1].get(0).unwrap().start()]
984                .lines()
985                .count()
986        } else {
987            lines.len()
988        };
989
990        let node = make_node(&name, path, NodeType::Method, start_line);
991        let node_id = node.id.clone();
992        functions.push((name, node_id.clone(), start_line, end_line));
993        result.nodes.push(node);
994        result.edges.push(make_edge(
995            &file_id,
996            &node_id,
997            "defines",
998            path,
999            Confidence::Extracted,
1000        ));
1001    }
1002
1003    // require / require_relative
1004    let re_require = Regex::new(r#"(?m)^\s*require(?:_relative)?\s+['"]([^'"]+)['"]"#).unwrap();
1005    for cap in re_require.captures_iter(source) {
1006        let module = &cap[1];
1007        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1008        let import_id = make_id(&[&ps, "require", module]);
1009        result.nodes.push(GraphNode {
1010            id: import_id.clone(),
1011            label: module.to_string(),
1012            source_file: ps.clone(),
1013            source_location: Some(format!("L{line}")),
1014            node_type: NodeType::Module,
1015            community: None,
1016            extra: HashMap::new(),
1017        });
1018        result.edges.push(make_edge(
1019            &file_id,
1020            &import_id,
1021            "imports",
1022            path,
1023            Confidence::Extracted,
1024        ));
1025    }
1026
1027    let call_edges = infer_calls(&functions, &lines, path);
1028    result.edges.extend(call_edges);
1029
1030    result
1031}
1032
1033// ═══════════════════════════════════════════════════════════════════════════
1034// C#
1035// ═══════════════════════════════════════════════════════════════════════════
1036
1037fn extract_csharp(path: &Path, source: &str) -> ExtractionResult {
1038    let mut result = ExtractionResult::default();
1039    let file_node = make_file_node(path);
1040    let file_id = file_node.id.clone();
1041    result.nodes.push(file_node);
1042
1043    let lines: Vec<&str> = source.lines().collect();
1044    let ps = path_str(path);
1045
1046    // Classes / interfaces / structs / enums
1047    let re_class = Regex::new(
1048        r"(?m)(?:public\s+|private\s+|protected\s+|internal\s+)?(?:abstract\s+|static\s+|sealed\s+|partial\s+)*(class|interface|struct|enum)\s+(\w+)",
1049    )
1050    .unwrap();
1051    for cap in re_class.captures_iter(source) {
1052        let kind = &cap[1];
1053        let name = &cap[2];
1054        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1055        let node_type = match kind {
1056            "interface" => NodeType::Interface,
1057            "struct" => NodeType::Struct,
1058            "enum" => NodeType::Enum,
1059            _ => NodeType::Class,
1060        };
1061        let node = make_node(name, path, node_type, line);
1062        let node_id = node.id.clone();
1063        result.nodes.push(node);
1064        result.edges.push(make_edge(
1065            &file_id,
1066            &node_id,
1067            "defines",
1068            path,
1069            Confidence::Extracted,
1070        ));
1071    }
1072
1073    // Methods
1074    let re_method = Regex::new(
1075        r"(?m)^\s+(?:public\s+|private\s+|protected\s+|internal\s+)?(?:static\s+)?(?:virtual\s+)?(?:override\s+)?(?:async\s+)?(?:\w+(?:<[^>]*>)?)\s+(\w+)\s*\(",
1076    )
1077    .unwrap();
1078    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
1079    let func_matches: Vec<_> = re_method.captures_iter(source).collect();
1080    for (i, cap) in func_matches.iter().enumerate() {
1081        let name = cap[1].to_string();
1082        if [
1083            "if", "for", "while", "switch", "catch", "return", "new", "throw",
1084        ]
1085        .contains(&name.as_str())
1086        {
1087            continue;
1088        }
1089        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1090        let end_line = if i + 1 < func_matches.len() {
1091            source[..func_matches[i + 1].get(0).unwrap().start()]
1092                .lines()
1093                .count()
1094        } else {
1095            lines.len()
1096        };
1097
1098        let node = make_node(&name, path, NodeType::Method, start_line);
1099        let node_id = node.id.clone();
1100        functions.push((name, node_id.clone(), start_line, end_line));
1101        result.nodes.push(node);
1102        result.edges.push(make_edge(
1103            &file_id,
1104            &node_id,
1105            "defines",
1106            path,
1107            Confidence::Extracted,
1108        ));
1109    }
1110
1111    // using directives
1112    let re_using = Regex::new(r"(?m)^using\s+([\w.]+)\s*;").unwrap();
1113    for cap in re_using.captures_iter(source) {
1114        let ns = &cap[1];
1115        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1116        let import_id = make_id(&[&ps, "using", ns]);
1117        result.nodes.push(GraphNode {
1118            id: import_id.clone(),
1119            label: ns.to_string(),
1120            source_file: ps.clone(),
1121            source_location: Some(format!("L{line}")),
1122            node_type: NodeType::Namespace,
1123            community: None,
1124            extra: HashMap::new(),
1125        });
1126        result.edges.push(make_edge(
1127            &file_id,
1128            &import_id,
1129            "imports",
1130            path,
1131            Confidence::Extracted,
1132        ));
1133    }
1134
1135    let call_edges = infer_calls(&functions, &lines, path);
1136    result.edges.extend(call_edges);
1137
1138    result
1139}
1140
1141// ═══════════════════════════════════════════════════════════════════════════
1142// Kotlin
1143// ═══════════════════════════════════════════════════════════════════════════
1144
1145fn extract_kotlin(path: &Path, source: &str) -> ExtractionResult {
1146    let mut result = ExtractionResult::default();
1147    let file_node = make_file_node(path);
1148    let file_id = file_node.id.clone();
1149    result.nodes.push(file_node);
1150
1151    let lines: Vec<&str> = source.lines().collect();
1152    let ps = path_str(path);
1153
1154    // Classes / objects / interfaces
1155    let re_class = Regex::new(
1156        r"(?m)(?:open\s+|abstract\s+|data\s+|sealed\s+)?(?:class|object|interface)\s+(\w+)",
1157    )
1158    .unwrap();
1159    for cap in re_class.captures_iter(source) {
1160        let name = &cap[1];
1161        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1162        let node = make_node(name, path, NodeType::Class, line);
1163        let node_id = node.id.clone();
1164        result.nodes.push(node);
1165        result.edges.push(make_edge(
1166            &file_id,
1167            &node_id,
1168            "defines",
1169            path,
1170            Confidence::Extracted,
1171        ));
1172    }
1173
1174    // Functions: `fun foo(`
1175    let re_func = Regex::new(r"(?m)^\s*(?:(?:private|public|protected|internal|override|open|suspend)\s+)*fun\s+(?:<[^>]+>\s+)?(\w+)\s*\(").unwrap();
1176    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
1177    let func_matches: Vec<_> = re_func.captures_iter(source).collect();
1178    for (i, cap) in func_matches.iter().enumerate() {
1179        let name = cap[1].to_string();
1180        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1181        let end_line = if i + 1 < func_matches.len() {
1182            source[..func_matches[i + 1].get(0).unwrap().start()]
1183                .lines()
1184                .count()
1185        } else {
1186            lines.len()
1187        };
1188
1189        let node = make_node(&name, path, NodeType::Function, start_line);
1190        let node_id = node.id.clone();
1191        functions.push((name, node_id.clone(), start_line, end_line));
1192        result.nodes.push(node);
1193        result.edges.push(make_edge(
1194            &file_id,
1195            &node_id,
1196            "defines",
1197            path,
1198            Confidence::Extracted,
1199        ));
1200    }
1201
1202    // Imports
1203    let re_import = Regex::new(r"(?m)^import\s+([\w.]+)").unwrap();
1204    for cap in re_import.captures_iter(source) {
1205        let module = &cap[1];
1206        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1207        let import_id = make_id(&[&ps, "import", module]);
1208        result.nodes.push(GraphNode {
1209            id: import_id.clone(),
1210            label: module.to_string(),
1211            source_file: ps.clone(),
1212            source_location: Some(format!("L{line}")),
1213            node_type: NodeType::Package,
1214            community: None,
1215            extra: HashMap::new(),
1216        });
1217        result.edges.push(make_edge(
1218            &file_id,
1219            &import_id,
1220            "imports",
1221            path,
1222            Confidence::Extracted,
1223        ));
1224    }
1225
1226    let call_edges = infer_calls(&functions, &lines, path);
1227    result.edges.extend(call_edges);
1228
1229    result
1230}
1231
1232// ═══════════════════════════════════════════════════════════════════════════
1233// Generic fallback (Scala, PHP, Swift, Lua, Zig, PowerShell, Elixir, ObjC, Julia)
1234// ═══════════════════════════════════════════════════════════════════════════
1235
1236fn extract_generic(path: &Path, source: &str, _lang: &str) -> ExtractionResult {
1237    let mut result = ExtractionResult::default();
1238    let file_node = make_file_node(path);
1239    let file_id = file_node.id.clone();
1240    result.nodes.push(file_node);
1241
1242    let lines: Vec<&str> = source.lines().collect();
1243    let ps = path_str(path);
1244
1245    // Generic class/struct/module pattern
1246    let re_class =
1247        Regex::new(r"(?m)^\s*(?:(?:pub|public|private|protected|internal|open|abstract|sealed|partial|static|final|export)\s+)*(?:class|struct|module|object|interface|trait|protocol|enum|defmodule)\s+(\w+(?:::\w+)*)")
1248            .unwrap();
1249    for cap in re_class.captures_iter(source) {
1250        let name = &cap[1];
1251        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1252        let node = make_node(name, path, NodeType::Class, line);
1253        let node_id = node.id.clone();
1254        result.nodes.push(node);
1255        result.edges.push(make_edge(
1256            &file_id,
1257            &node_id,
1258            "defines",
1259            path,
1260            Confidence::Extracted,
1261        ));
1262    }
1263
1264    // Generic function pattern
1265    let re_func = Regex::new(
1266        r"(?m)^\s*(?:(?:pub|public|private|protected|internal|open|override|suspend|static|async|export|def|defp)\s+)*(?:func|function|fn|def|defp|fun|sub)\s+(\w+[?!]?)\s*[\(<]",
1267    )
1268    .unwrap();
1269    let mut functions: Vec<(String, String, usize, usize)> = Vec::new();
1270    let func_matches: Vec<_> = re_func.captures_iter(source).collect();
1271    for (i, cap) in func_matches.iter().enumerate() {
1272        let name = cap[1].to_string();
1273        let start_line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1274        let end_line = if i + 1 < func_matches.len() {
1275            source[..func_matches[i + 1].get(0).unwrap().start()]
1276                .lines()
1277                .count()
1278        } else {
1279            lines.len()
1280        };
1281
1282        let node = make_node(&name, path, NodeType::Function, start_line);
1283        let node_id = node.id.clone();
1284        functions.push((name, node_id.clone(), start_line, end_line));
1285        result.nodes.push(node);
1286        result.edges.push(make_edge(
1287            &file_id,
1288            &node_id,
1289            "defines",
1290            path,
1291            Confidence::Extracted,
1292        ));
1293    }
1294
1295    // Generic import pattern
1296    let re_import =
1297        Regex::new(r#"(?m)^\s*(?:import|use|using|require|include|from)\s+['"]?([\w./:-]+)['"]?"#)
1298            .unwrap();
1299    for cap in re_import.captures_iter(source) {
1300        let module = &cap[1];
1301        let line = source[..cap.get(0).unwrap().start()].lines().count() + 1;
1302        let import_id = make_id(&[&ps, "import", module]);
1303        result.nodes.push(GraphNode {
1304            id: import_id.clone(),
1305            label: module.to_string(),
1306            source_file: ps.clone(),
1307            source_location: Some(format!("L{line}")),
1308            node_type: NodeType::Module,
1309            community: None,
1310            extra: HashMap::new(),
1311        });
1312        result.edges.push(make_edge(
1313            &file_id,
1314            &import_id,
1315            "imports",
1316            path,
1317            Confidence::Extracted,
1318        ));
1319    }
1320
1321    let call_edges = infer_calls(&functions, &lines, path);
1322    result.edges.extend(call_edges);
1323
1324    result
1325}
1326
1327// Tests moved to tests/ast_extract.rs (integration tests)