Skip to main content

open_kioku_parse/
lib.rs

1use chrono::Utc;
2use open_kioku_core::{
3    AnalysisFact, CodeChunk, Confidence, EvidenceSourceType, File, GraphEdgeType, GraphNodeType,
4    Import, Language, LineRange, ScoreComponent, Symbol, SymbolId, SymbolKind, TestTarget,
5};
6use regex::Regex;
7use sha2::{Digest, Sha256};
8use std::collections::HashSet;
9
/// Everything a [`Parser`] extracted from a single source file.
#[derive(Debug, Clone)]
pub struct ParsedFile {
    /// Text chunks cut from the file.
    pub chunks: Vec<CodeChunk>,
    /// Symbol definitions found in the file.
    pub symbols: Vec<Symbol>,
    /// Import-like statements found in the file.
    pub imports: Vec<Import>,
    /// Static-analysis facts (graph edges) derived from the file.
    pub analysis_facts: Vec<AnalysisFact>,
    /// Test targets detected in the file.
    pub tests: Vec<TestTarget>,
}
18
/// Turns the text of one file into a [`ParsedFile`].
pub trait Parser: Send + Sync {
    /// Parses `content` without a build-system hint.
    ///
    /// The default implementation forwards to [`Parser::parse_with_hint`] with `None`.
    fn parse(&self, file: &File, content: &str) -> ParsedFile {
        self.parse_with_hint(file, content, None)
    }
    /// Parses `content`, optionally passing along a build-system hint string.
    fn parse_with_hint(&self, file: &File, content: &str, build_hint: Option<&str>) -> ParsedFile;
}
25
/// Stateless [`Parser`] implementation built on the extraction functions in this module.
#[derive(Default)]
pub struct HeuristicParser;
28
29impl Parser for HeuristicParser {
30    fn parse_with_hint(&self, file: &File, content: &str, build_hint: Option<&str>) -> ParsedFile {
31        let imports = extract_imports(file, content);
32        let mut symbols = extract_symbols(file, content);
33        dedupe_symbols(&mut symbols);
34        let analysis_facts = extract_analysis_facts(file, content, &symbols);
35        let mut chunks = extract_chunks(file, content, &symbols);
36        dedupe_chunks(&mut chunks);
37        let tests = extract_tests(file, content, &symbols, build_hint);
38        ParsedFile {
39            chunks,
40            symbols,
41            imports,
42            analysis_facts,
43            tests,
44        }
45    }
46}
47
48fn dedupe_symbols(symbols: &mut Vec<Symbol>) {
49    let mut seen = HashSet::new();
50    symbols.retain(|symbol| seen.insert(symbol.id.clone()));
51}
52
53fn dedupe_chunks(chunks: &mut Vec<CodeChunk>) {
54    let mut seen = HashSet::new();
55    chunks.retain(|chunk| seen.insert(chunk.id.clone()));
56}
57
/// Extracts symbol definitions from `content`.
///
/// The result of `open_kioku_tree_sitter::parse_symbols` is returned when it
/// succeeds with at least one symbol. Otherwise a per-language table of line
/// regexes is applied through `extract_with_patterns`. Languages without a
/// table yield an empty vector.
///
/// Each table entry is `(pattern, symbol kind, index of the capture group
/// holding the symbol name)`.
pub fn extract_symbols(file: &File, content: &str) -> Vec<Symbol> {
    // An error or an empty result falls through to the regex heuristics.
    if let Ok(symbols) = open_kioku_tree_sitter::parse_symbols(file, content) {
        if !symbols.is_empty() {
            return symbols;
        }
    }
    match file.language {
        Language::Rust => extract_with_patterns(
            file,
            content,
            &[
                (
                    r"^\s*(pub\s+)?(async\s+)?fn\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Function,
                    3,
                ),
                // Structs and enums are both reported as `SymbolKind::Class`.
                (
                    r"^\s*(pub\s+)?struct\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Class,
                    2,
                ),
                (
                    r"^\s*(pub\s+)?enum\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Class,
                    2,
                ),
                (
                    r"^\s*(pub\s+)?trait\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Trait,
                    2,
                ),
                (r"^\s*mod\s+([A-Za-z_][A-Za-z0-9_]*)", SymbolKind::Module, 1),
            ],
        ),
        Language::Java => extract_with_patterns(
            file,
            content,
            &[
                (
                    r"\b(class|record)\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Class,
                    2,
                ),
                (
                    r"\binterface\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Interface,
                    1,
                ),
                // Anything shaped like `<type> <name>(` is treated as a method.
                (
                    r"\b(?:public|private|protected)?\s*(?:static\s+)?[A-Za-z0-9_<>\[\], ?]+\s+([A-Za-z_][A-Za-z0-9_]*)\s*\(",
                    SymbolKind::Method,
                    1,
                ),
            ],
        ),
        Language::TypeScript | Language::JavaScript => extract_with_patterns(
            file,
            content,
            &[
                (
                    r"\bfunction\s+([A-Za-z_$][A-Za-z0-9_$]*)",
                    SymbolKind::Function,
                    1,
                ),
                (
                    r"\bclass\s+([A-Za-z_$][A-Za-z0-9_$]*)",
                    SymbolKind::Class,
                    1,
                ),
                (
                    r"\binterface\s+([A-Za-z_$][A-Za-z0-9_$]*)",
                    SymbolKind::Interface,
                    1,
                ),
                // `const name = (` / `const name = async (` counts as a function.
                (
                    r"\b(?:const|let|var)\s+([A-Za-z_$][A-Za-z0-9_$]*)\s*=\s*(?:async\s*)?\(",
                    SymbolKind::Function,
                    1,
                ),
                (
                    r"\bexport\s+(?:const|let|var)\s+([A-Za-z_$][A-Za-z0-9_$]*)",
                    SymbolKind::Variable,
                    1,
                ),
            ],
        ),
        Language::Python => extract_with_patterns(
            file,
            content,
            &[
                (
                    r"^\s*def\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Function,
                    1,
                ),
                (
                    r"^\s*async\s+def\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Function,
                    1,
                ),
                (
                    r"^\s*class\s+([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Class,
                    1,
                ),
            ],
        ),
        Language::Go => extract_with_patterns(
            file,
            content,
            &[
                // The optional `(...)` group skips a method receiver.
                (
                    r"^\s*func\s+(?:\([^)]+\)\s*)?([A-Za-z_][A-Za-z0-9_]*)",
                    SymbolKind::Function,
                    1,
                ),
                (
                    r"^\s*type\s+([A-Za-z_][A-Za-z0-9_]*)\s+struct",
                    SymbolKind::Class,
                    1,
                ),
                (
                    r"^\s*type\s+([A-Za-z_][A-Za-z0-9_]*)\s+interface",
                    SymbolKind::Interface,
                    1,
                ),
            ],
        ),
        Language::Sql => extract_with_patterns(
            file,
            content,
            &[(
                r"(?i)^\s*create\s+table\s+([A-Za-z_][A-Za-z0-9_\.]*)",
                SymbolKind::DatabaseTable,
                1,
            )],
        ),
        _ => Vec::new(),
    }
}
198
199fn extract_with_patterns(
200    file: &File,
201    content: &str,
202    specs: &[(&str, SymbolKind, usize)],
203) -> Vec<Symbol> {
204    let compiled = specs
205        .iter()
206        .filter_map(|(pattern, kind, capture)| {
207            Regex::new(pattern)
208                .ok()
209                .map(|re| (re, kind.clone(), *capture))
210        })
211        .collect::<Vec<_>>();
212    let mut symbols = Vec::new();
213    for (idx, line) in content.lines().enumerate() {
214        for (regex, kind, capture) in &compiled {
215            if let Some(captures) = regex.captures(line) {
216                if let Some(name) = captures.get(*capture) {
217                    let line_number = (idx + 1) as u32;
218                    let qualified_name = qualified_name(file, name.as_str());
219                    symbols.push(Symbol {
220                        id: SymbolId::new(stable_id(&format!(
221                            "{}:{}:{}",
222                            file.path.display(),
223                            line_number,
224                            qualified_name
225                        ))),
226                        name: name.as_str().to_string(),
227                        qualified_name,
228                        kind: kind.clone(),
229                        file_id: file.id.clone(),
230                        range: Some(LineRange::single(line_number)),
231                        language: file.language.clone(),
232                        confidence: Confidence::Medium,
233                        provenance: EvidenceSourceType::Heuristic,
234                    });
235                }
236            }
237        }
238    }
239    symbols
240}
241
242pub fn extract_imports(file: &File, content: &str) -> Vec<Import> {
243    let patterns = match file.language {
244        Language::Rust => vec![r"^\s*use\s+([^;]+)", r"^\s*mod\s+([A-Za-z_][A-Za-z0-9_]*)"],
245        Language::Java => vec![r"^\s*import\s+([^;]+)"],
246        Language::TypeScript | Language::JavaScript => {
247            vec![r#"from\s+["']([^"']+)["']"#, r#"import\s+["']([^"']+)["']"#]
248        }
249        Language::Python => vec![
250            r"^\s*import\s+([A-Za-z0-9_\.]+)",
251            r"^\s*from\s+([A-Za-z0-9_\.]+)\s+import",
252        ],
253        Language::Go => vec![r#"^\s*import\s+"([^"]+)""#],
254        _ => Vec::new(),
255    };
256    let compiled = patterns
257        .iter()
258        .filter_map(|pattern| Regex::new(pattern).ok())
259        .collect::<Vec<_>>();
260    let mut imports = Vec::new();
261    for (idx, line) in content.lines().enumerate() {
262        for regex in &compiled {
263            if let Some(captures) = regex.captures(line) {
264                if let Some(value) = captures.get(1) {
265                    imports.push(Import {
266                        file_id: file.id.clone(),
267                        imported: value.as_str().trim().to_string(),
268                        range: Some(LineRange::single((idx + 1) as u32)),
269                        confidence: Confidence::Medium,
270                    });
271                }
272            }
273        }
274    }
275    imports
276}
277
278pub fn extract_analysis_facts(file: &File, content: &str, symbols: &[Symbol]) -> Vec<AnalysisFact> {
279    match file.language {
280        Language::Java => extract_java_analysis_facts(file, content, symbols),
281        Language::TypeScript | Language::JavaScript => {
282            extract_javascript_analysis_facts(file, content, symbols)
283        }
284        Language::Python => extract_python_analysis_facts(file, content, symbols),
285        Language::Rust => extract_rust_analysis_facts(file, content, symbols),
286        _ => Vec::new(),
287    }
288}
289
/// Derives static-analysis facts from Java source, one line at a time.
///
/// Recognised constructs: class/record/enum `extends` and `implements`,
/// interface `extends`, Spring mapping annotations, `System.getenv(...)`
/// reads, Spring `@Value("${...}")` reads and JPA `@Table(name = ...)`.
/// Facts are de-duplicated by id before being returned.
fn extract_java_analysis_facts(
    file: &File,
    content: &str,
    symbols: &[Symbol],
) -> Vec<AnalysisFact> {
    let mut facts = Vec::new();
    // Group 1: type name, group 2: superclass, group 3: implemented interfaces.
    let class_re = Regex::new(
        r"\b(?:class|record|enum)\s+([A-Za-z_][A-Za-z0-9_]*)(?:\s+extends\s+([A-Za-z0-9_.$<>]+))?(?:\s+implements\s+([A-Za-z0-9_.$<>,\s]+))?",
    )
    .expect("valid Java class regex");
    // Group 1: interface name, group 2: comma-separated parent interfaces.
    let interface_re = Regex::new(
        r"\binterface\s+([A-Za-z_][A-Za-z0-9_]*)(?:\s+extends\s+([A-Za-z0-9_.$<>,\s]+))?",
    )
    .expect("valid Java interface regex");
    // Group 1: annotation name, group 2: optional route string.
    let mapping_re = Regex::new(
        r#"@(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)(?:\s*\(\s*(?:value\s*=\s*)?["']([^"']+)["'])?"#,
    )
    .expect("valid Spring mapping regex");
    let env_re =
        Regex::new(r#"System\.getenv\(\s*["']([^"']+)["']\s*\)"#).expect("valid getenv regex");
    // Group 1 is the property key; an optional `:default` suffix is skipped.
    let value_re = Regex::new(r#"@Value\(\s*["']\$\{([^}:]+)(?::[^}]*)?\}["']\s*\)"#)
        .expect("valid Spring value regex");
    let table_re =
        Regex::new(r#"@Table\(\s*name\s*=\s*["']([^"']+)["']"#).expect("valid table regex");

    for (idx, line) in content.lines().enumerate() {
        let line_number = (idx + 1) as u32;
        if let Some(captures) = class_re.captures(line) {
            // The declaring symbol is looked up by simple name; it may be absent.
            let source = captures.get(1).map(|value| value.as_str());
            let source_symbol = source.and_then(|name| symbol_named(symbols, name));
            if let Some(base) = captures.get(2) {
                facts.push(analysis_fact(
                    file,
                    source_symbol,
                    GraphEdgeType::Extends,
                    GraphNodeType::Class,
                    clean_java_type(base.as_str()),
                    line_number,
                    ("open-kioku-static/java", "Java class inheritance"),
                ));
            }
            if let Some(interfaces) = captures.get(3) {
                for interface in split_java_types(interfaces.as_str()) {
                    facts.push(analysis_fact(
                        file,
                        source_symbol,
                        GraphEdgeType::Implements,
                        GraphNodeType::Interface,
                        interface,
                        line_number,
                        ("open-kioku-static/java", "Java implemented interface"),
                    ));
                }
            }
        }
        if let Some(captures) = interface_re.captures(line) {
            let source = captures.get(1).map(|value| value.as_str());
            let source_symbol = source.and_then(|name| symbol_named(symbols, name));
            if let Some(parents) = captures.get(2) {
                for parent in split_java_types(parents.as_str()) {
                    facts.push(analysis_fact(
                        file,
                        source_symbol,
                        GraphEdgeType::Extends,
                        GraphNodeType::Interface,
                        parent,
                        line_number,
                        ("open-kioku-static/java", "Java interface inheritance"),
                    ));
                }
            }
        }
        if let Some(captures) = mapping_re.captures(line) {
            let method = spring_http_method(captures.get(1).map(|value| value.as_str()));
            // A mapping annotation without a route string is recorded as "/".
            let route = captures.get(2).map(|value| value.as_str()).unwrap_or("/");
            // Attach to the first symbol starting on this line or up to 4 lines below.
            let source_symbol = symbol_at_or_after(symbols, line_number, 4);
            facts.push(analysis_fact(
                file,
                source_symbol,
                GraphEdgeType::ExposesEndpoint,
                GraphNodeType::Endpoint,
                format!("{method} {route}"),
                line_number,
                ("open-kioku-static/java", "Spring MVC endpoint mapping"),
            ));
        }
        // Unlike the other checks, every getenv call on the line is recorded.
        for captures in env_re.captures_iter(line) {
            if let Some(key) = captures.get(1) {
                facts.push(analysis_fact(
                    file,
                    symbol_at_or_before(symbols, line_number),
                    GraphEdgeType::ReadsConfig,
                    GraphNodeType::ConfigKey,
                    key.as_str().to_string(),
                    line_number,
                    ("open-kioku-static/java", "Java environment variable read"),
                ));
            }
        }
        if let Some(captures) = value_re.captures(line) {
            if let Some(key) = captures.get(1) {
                facts.push(analysis_fact(
                    file,
                    symbol_at_or_after(symbols, line_number, 3),
                    GraphEdgeType::ReadsConfig,
                    GraphNodeType::ConfigKey,
                    key.as_str().to_string(),
                    line_number,
                    ("open-kioku-static/java", "Spring configuration value read"),
                ));
            }
        }
        if let Some(captures) = table_re.captures(line) {
            if let Some(table) = captures.get(1) {
                facts.push(analysis_fact(
                    file,
                    symbol_at_or_after(symbols, line_number, 3),
                    GraphEdgeType::ReadsTable,
                    GraphNodeType::DatabaseTable,
                    table.as_str().to_string(),
                    line_number,
                    ("open-kioku-static/java", "JPA table mapping"),
                ));
            }
        }
    }
    dedupe_analysis_facts(&mut facts);
    facts
}
419
420fn extract_javascript_analysis_facts(
421    file: &File,
422    content: &str,
423    symbols: &[Symbol],
424) -> Vec<AnalysisFact> {
425    let mut facts = Vec::new();
426    let route_re =
427        Regex::new(r#"\b(?:app|router)\.(get|post|put|delete|patch|all)\(\s*["']([^"']+)["']"#)
428            .expect("valid JavaScript route regex");
429    for (idx, line) in content.lines().enumerate() {
430        let line_number = (idx + 1) as u32;
431        for captures in route_re.captures_iter(line) {
432            let method = captures
433                .get(1)
434                .map(|value| value.as_str().to_ascii_uppercase())
435                .unwrap_or_else(|| "HTTP".into());
436            let route = captures.get(2).map(|value| value.as_str()).unwrap_or("/");
437            facts.push(analysis_fact(
438                file,
439                symbol_at_or_before(symbols, line_number),
440                GraphEdgeType::ExposesEndpoint,
441                GraphNodeType::Endpoint,
442                format!("{method} {route}"),
443                line_number,
444                ("open-kioku-static/javascript", "JavaScript HTTP route"),
445            ));
446        }
447    }
448    facts
449}
450
451fn extract_python_analysis_facts(
452    file: &File,
453    content: &str,
454    symbols: &[Symbol],
455) -> Vec<AnalysisFact> {
456    let mut facts = Vec::new();
457    let route_re = Regex::new(
458        r#"@(?:app|router|blueprint)\.(get|post|put|delete|patch|route)\(\s*["']([^"']+)["']"#,
459    )
460    .expect("valid Python route regex");
461    for (idx, line) in content.lines().enumerate() {
462        let line_number = (idx + 1) as u32;
463        for captures in route_re.captures_iter(line) {
464            let method = match captures.get(1).map(|value| value.as_str()) {
465                Some("route") => "HTTP".to_string(),
466                Some(value) => value.to_ascii_uppercase(),
467                None => "HTTP".into(),
468            };
469            let route = captures.get(2).map(|value| value.as_str()).unwrap_or("/");
470            facts.push(analysis_fact(
471                file,
472                symbol_at_or_after(symbols, line_number, 2),
473                GraphEdgeType::ExposesEndpoint,
474                GraphNodeType::Endpoint,
475                format!("{method} {route}"),
476                line_number,
477                ("open-kioku-static/python", "Python HTTP route decorator"),
478            ));
479        }
480    }
481    facts
482}
483
484fn extract_rust_analysis_facts(
485    file: &File,
486    content: &str,
487    symbols: &[Symbol],
488) -> Vec<AnalysisFact> {
489    let mut facts = Vec::new();
490    let route_re = Regex::new(r#"#\[(get|post|put|delete|patch)\(\s*["']([^"']+)["']\s*\)\]"#)
491        .expect("valid Rust route regex");
492    for (idx, line) in content.lines().enumerate() {
493        let line_number = (idx + 1) as u32;
494        for captures in route_re.captures_iter(line) {
495            let method = captures
496                .get(1)
497                .map(|value| value.as_str().to_ascii_uppercase())
498                .unwrap_or_else(|| "HTTP".into());
499            let route = captures.get(2).map(|value| value.as_str()).unwrap_or("/");
500            facts.push(analysis_fact(
501                file,
502                symbol_at_or_after(symbols, line_number, 2),
503                GraphEdgeType::ExposesEndpoint,
504                GraphNodeType::Endpoint,
505                format!("{method} {route}"),
506                line_number,
507                ("open-kioku-static/rust", "Rust HTTP route attribute"),
508            ));
509        }
510    }
511    facts
512}
513
514fn analysis_fact(
515    file: &File,
516    symbol: Option<&Symbol>,
517    edge_type: GraphEdgeType,
518    target_kind: GraphNodeType,
519    target: String,
520    line_number: u32,
521    source: (&str, &str),
522) -> AnalysisFact {
523    AnalysisFact {
524        id: stable_id(&format!(
525            "analysis:{}:{}:{:?}:{}:{}",
526            file.path.display(),
527            symbol
528                .map(|symbol| symbol.id.0.as_str())
529                .unwrap_or("<file>"),
530            edge_type,
531            target,
532            line_number
533        )),
534        file_id: file.id.clone(),
535        symbol_id: symbol.map(|symbol| symbol.id.clone()),
536        target,
537        target_kind,
538        edge_type,
539        range: Some(LineRange::single(line_number)),
540        confidence: Confidence::Medium,
541        source: source.0.into(),
542        source_type: EvidenceSourceType::StaticAnalysis,
543        message: source.1.into(),
544    }
545}
546
547fn symbol_named<'a>(symbols: &'a [Symbol], name: &str) -> Option<&'a Symbol> {
548    symbols.iter().find(|symbol| symbol.name == name)
549}
550
551fn symbol_at_or_after(symbols: &[Symbol], line_number: u32, max_distance: u32) -> Option<&Symbol> {
552    symbols
553        .iter()
554        .filter_map(|symbol| {
555            let start = symbol.range.as_ref()?.start;
556            (start >= line_number && start <= line_number + max_distance).then_some((start, symbol))
557        })
558        .min_by_key(|(start, _)| *start)
559        .map(|(_, symbol)| symbol)
560}
561
562fn symbol_at_or_before(symbols: &[Symbol], line_number: u32) -> Option<&Symbol> {
563    symbols
564        .iter()
565        .filter_map(|symbol| {
566            let start = symbol.range.as_ref()?.start;
567            (start <= line_number).then_some((start, symbol))
568        })
569        .max_by_key(|(start, _)| *start)
570        .map(|(_, symbol)| symbol)
571}
572
/// Normalises a single Java type reference to its raw name.
///
/// Surrounding whitespace and commas are removed, and everything from the
/// first `<` onwards (the generic arguments) is dropped.
fn clean_java_type(value: &str) -> String {
    let stripped = value.trim().trim_matches(',');
    let raw = match stripped.find('<') {
        Some(generic_start) => &stripped[..generic_start],
        None => stripped,
    };
    raw.trim().to_string()
}

/// Splits a comma-separated list of Java types into cleaned raw type names.
///
/// Only commas outside angle brackets separate entries, so the list
/// `Map<String, Integer>, Audited` yields `Map` and `Audited` instead of
/// leaking a bogus `Integer>` entry. Empty entries are dropped.
fn split_java_types(value: &str) -> Vec<String> {
    let mut types = Vec::new();
    let mut depth = 0usize;
    let mut segment_start = 0usize;
    for (idx, ch) in value.char_indices() {
        match ch {
            '<' => depth += 1,
            // Saturate so a stray `>` cannot underflow the nesting depth.
            '>' => depth = depth.saturating_sub(1),
            ',' if depth == 0 => {
                types.push(clean_java_type(&value[segment_start..idx]));
                // `,` is one byte wide, so the next segment starts right after it.
                segment_start = idx + 1;
            }
            _ => {}
        }
    }
    types.push(clean_java_type(&value[segment_start..]));
    types.retain(|name| !name.is_empty());
    types
}
591
/// Maps a Spring mapping annotation name to the HTTP verb it implies.
///
/// `RequestMapping`, unknown names and `None` all map to the generic `HTTP`.
fn spring_http_method(annotation: Option<&str>) -> &'static str {
    let name = annotation.unwrap_or("");
    match name {
        "GetMapping" => "GET",
        "PostMapping" => "POST",
        "PutMapping" => "PUT",
        "DeleteMapping" => "DELETE",
        "PatchMapping" => "PATCH",
        _ => "HTTP",
    }
}
603
604fn dedupe_analysis_facts(facts: &mut Vec<AnalysisFact>) {
605    let mut seen = HashSet::new();
606    facts.retain(|fact| seen.insert(fact.id.clone()));
607}
608
609pub fn extract_chunks(file: &File, content: &str, symbols: &[Symbol]) -> Vec<CodeChunk> {
610    if content.trim().is_empty() {
611        return Vec::new();
612    }
613    let lines = content.lines().collect::<Vec<_>>();
614    let mut chunks = Vec::new();
615    let mut starts = symbols
616        .iter()
617        .filter_map(|symbol| {
618            symbol
619                .range
620                .as_ref()
621                .map(|range| (range.start as usize, symbol.id.clone()))
622        })
623        .collect::<Vec<_>>();
624    starts.sort_by_key(|(line, _)| *line);
625    starts.dedup_by_key(|(line, _)| *line);
626    if starts.is_empty() {
627        for (idx, window) in lines.chunks(80).enumerate() {
628            let start = idx * 80 + 1;
629            let end = start + window.len().saturating_sub(1);
630            chunks.push(CodeChunk {
631                id: stable_id(&format!("{}:{start}:{end}", file.path.display())),
632                file_id: file.id.clone(),
633                range: LineRange {
634                    start: start as u32,
635                    end: end as u32,
636                },
637                language: file.language.clone(),
638                text: window.join("\n"),
639                symbol_id: None,
640            });
641        }
642        return chunks;
643    }
644    for (idx, (start, symbol_id)) in starts.iter().enumerate() {
645        let next = starts
646            .get(idx + 1)
647            .map(|(line, _)| *line)
648            .unwrap_or(lines.len() + 1);
649        let end = next.saturating_sub(1).min(lines.len());
650        let text = lines[start.saturating_sub(1)..end].join("\n");
651        chunks.push(CodeChunk {
652            id: stable_id(&format!("{}:{start}:{end}", file.path.display())),
653            file_id: file.id.clone(),
654            range: LineRange {
655                start: *start as u32,
656                end: end as u32,
657            },
658            language: file.language.clone(),
659            text,
660            symbol_id: Some(symbol_id.clone()),
661        });
662    }
663    chunks
664}
665
666pub fn extract_tests(
667    file: &File,
668    content: &str,
669    symbols: &[Symbol],
670    build_hint: Option<&str>,
671) -> Vec<TestTarget> {
672    let path = file.path.to_string_lossy().to_ascii_lowercase();
673    let is_test_file = path.contains("/test/")
674        || path.contains("/tests/")
675        || path.ends_with("_test.rs")
676        || path.ends_with("_test.go")
677        || path.ends_with("test.java")
678        || path.ends_with(".spec.ts")
679        || path.ends_with(".test.ts")
680        || path.ends_with("_test.py");
681
682    symbols
683        .iter()
684        .filter(|symbol| {
685            is_test_file
686                || symbol.name.starts_with("test")
687                || content
688                    .lines()
689                    .any(|line| line.contains("#[test]") || line.contains("@Test"))
690        })
691        .map(|symbol| TestTarget {
692            id: stable_id(&format!("test:{}:{}", file.path.display(), symbol.name)),
693            name: symbol.name.clone(),
694            file_id: file.id.clone(),
695            range: symbol.range.clone(),
696            command: recommended_command(&file.language, &file.path.to_string_lossy(), build_hint),
697            confidence: if is_test_file {
698                Confidence::High
699            } else {
700                Confidence::Medium
701            },
702            reason: "test-like path, annotation, or naming convention".into(),
703            evidence_refs: vec![stable_id(&format!(
704                "test:{}:{}",
705                file.path.display(),
706                symbol.name
707            ))],
708            score_breakdown: vec![ScoreComponent::single(
709                "indexed_test_confidence",
710                if is_test_file {
711                    Confidence::High.score()
712                } else {
713                    Confidence::Medium.score()
714                },
715                vec![stable_id(&format!(
716                    "test:{}:{}",
717                    file.path.display(),
718                    symbol.name
719                ))],
720                "test-like path, annotation, or naming convention",
721            )],
722        })
723        .collect()
724}
725
726fn qualified_name(file: &File, name: &str) -> String {
727    let stem = file
728        .path
729        .with_extension("")
730        .to_string_lossy()
731        .replace(['/', '\\'], "::");
732    format!("{stem}::{name}")
733}
734
735fn stable_id(value: &str) -> String {
736    let mut hasher = Sha256::new();
737    hasher.update(value.as_bytes());
738    format!("{:x}", hasher.finalize())
739}
740
741fn recommended_command(
742    language: &Language,
743    path: &str,
744    build_hint: Option<&str>,
745) -> Option<String> {
746    match (language, build_hint) {
747        (Language::Java, Some("gradle")) => Some("./gradlew test".into()),
748        (Language::Java, Some("bazel")) => Some("bazel test //...".into()),
749        (Language::Java, Some("maven") | _) => Some("mvn test".into()),
750        (Language::Rust, _) => Some("cargo test".into()),
751        (Language::TypeScript | Language::JavaScript, _) => Some("npm test".into()),
752        (Language::Python, _) => Some("pytest".into()),
753        (Language::Go, _) => Some("go test ./...".into()),
754        _ if path.contains("test") => Some("run repository test command".into()),
755        _ => None,
756    }
757}
758
/// Returns the current UTC time.
pub fn evidence_timestamp() -> chrono::DateTime<Utc> {
    Utc::now()
}
762
763#[cfg(test)]
764mod tests {
765    use super::{
766        extract_analysis_facts, extract_chunks, extract_imports, extract_symbols, extract_tests,
767    };
768    use open_kioku_core::{
769        Confidence, EvidenceSourceType, File, FileId, GraphEdgeType, GraphNodeType, Language,
770        LineRange, RepositoryId, Symbol, SymbolId, SymbolKind,
771    };
772
    /// Fixture: a Rust `File` record for `src/lib.rs`.
    fn rust_file() -> File {
        File {
            id: FileId::new("file-rs"),
            repository_id: RepositoryId::new("repo"),
            path: "src/lib.rs".into(),
            language: Language::Rust,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        }
    }
785
    /// Fixture: a Python `File` record for `app/service.py`.
    fn python_file() -> File {
        File {
            id: FileId::new("file-py"),
            repository_id: RepositoryId::new("repo"),
            path: "app/service.py".into(),
            language: Language::Python,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        }
    }
798
    /// Fixture: a TypeScript `File` record for `src/index.ts`.
    fn ts_file() -> File {
        File {
            id: FileId::new("file-ts"),
            repository_id: RepositoryId::new("repo"),
            path: "src/index.ts".into(),
            language: Language::TypeScript,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        }
    }
811
    /// Fixture: a Java `File` record for `src/main/java/com/acme/OrderController.java`.
    fn java_file() -> File {
        File {
            id: FileId::new("file-java"),
            repository_id: RepositoryId::new("repo"),
            path: "src/main/java/com/acme/OrderController.java".into(),
            language: Language::Java,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        }
    }
824
825    // ─── extract_symbols ──────────────────────────────────────────────────────
826
    /// A function, struct, trait and module declared on separate lines must
    /// all be reported by name.
    #[test]
    fn extracts_rust_functions_and_structs() {
        let file = rust_file();
        let src = "pub fn do_work() {}\npub struct Worker;\npub trait Runnable {}\nmod utils {}";
        let symbols = extract_symbols(&file, src);
        let names: Vec<_> = symbols.iter().map(|s| s.name.as_str()).collect();
        assert!(names.contains(&"do_work"), "should find function");
        assert!(names.contains(&"Worker"), "should find struct");
        assert!(names.contains(&"Runnable"), "should find trait");
        assert!(names.contains(&"utils"), "should find module");
    }
838
    /// A top-level Python class and function must both be reported by name.
    #[test]
    fn extracts_python_class_and_function() {
        let file = python_file();
        let src = "class MyService:\n    pass\n\ndef handle_request():\n    pass\n";
        let symbols = extract_symbols(&file, src);
        let names: Vec<_> = symbols.iter().map(|s| s.name.as_str()).collect();
        assert!(names.contains(&"MyService"), "should find class");
        assert!(names.contains(&"handle_request"), "should find function");
    }
848
849    #[test]
850    fn extracts_typescript_class_and_function() {
851        let file = ts_file();
852        let src = "class ApiClient {}\nfunction fetchData() {}\nconst handler = () => {};";
853        let symbols = extract_symbols(&file, src);
854        let names: Vec<_> = symbols.iter().map(|s| s.name.as_str()).collect();
855        assert!(names.contains(&"ApiClient") || !symbols.is_empty());
856    }
857
858    // ─── extract_imports ──────────────────────────────────────────────────────
859
860    #[test]
861    fn extracts_rust_use_imports() {
862        let file = rust_file();
863        let src = "use std::collections::HashMap;\nuse crate::worker::Worker;";
864        let imports = extract_imports(&file, src);
865        assert_eq!(imports.len(), 2);
866        assert!(imports.iter().any(|i| i.imported.contains("HashMap")));
867    }
868
869    #[test]
870    fn extracts_python_imports() {
871        let file = python_file();
872        let src = "import os\nfrom pathlib import Path\n";
873        let imports = extract_imports(&file, src);
874        assert_eq!(imports.len(), 2);
875        assert!(imports.iter().any(|i| i.imported == "os"));
876        assert!(imports.iter().any(|i| i.imported == "pathlib"));
877    }
878
879    #[test]
880    fn extracts_typescript_imports() {
881        let file = ts_file();
882        let src = "import { foo } from './foo';\nimport './styles.css';";
883        let imports = extract_imports(&file, src);
884        assert!(!imports.is_empty());
885        assert!(imports.iter().any(|i| i.imported.contains("foo")));
886    }
887
888    #[test]
889    fn extracts_java_static_analysis_facts() {
890        let file = java_file();
891        let src = r#"
892class OrderController extends BaseController implements OrderApi, Audited {
893    @GetMapping("/orders/{id}")
894    public Order getOrder() {
895        System.getenv("ORDER_REGION");
896        return null;
897    }
898}
899"#;
900        let symbols = extract_symbols(&file, src);
901        let facts = extract_analysis_facts(&file, src, &symbols);
902        assert!(facts.iter().any(|fact| {
903            fact.edge_type == GraphEdgeType::Extends
904                && fact.target == "BaseController"
905                && fact.target_kind == GraphNodeType::Class
906        }));
907        assert!(facts.iter().any(|fact| {
908            fact.edge_type == GraphEdgeType::Implements
909                && fact.target == "OrderApi"
910                && fact.target_kind == GraphNodeType::Interface
911        }));
912        assert!(facts.iter().any(|fact| {
913            fact.edge_type == GraphEdgeType::ExposesEndpoint && fact.target == "GET /orders/{id}"
914        }));
915        assert!(facts.iter().any(|fact| {
916            fact.edge_type == GraphEdgeType::ReadsConfig && fact.target == "ORDER_REGION"
917        }));
918    }
919
920    #[test]
921    fn extracts_route_facts_for_script_languages() {
922        let ts = ts_file();
923        let ts_src = r#"router.post("/v1/orders", handler);"#;
924        let ts_facts = extract_analysis_facts(&ts, ts_src, &extract_symbols(&ts, ts_src));
925        assert!(ts_facts.iter().any(|fact| {
926            fact.edge_type == GraphEdgeType::ExposesEndpoint && fact.target == "POST /v1/orders"
927        }));
928
929        let py = python_file();
930        let py_src = "@app.get('/health')\ndef health():\n    return {}\n";
931        let py_facts = extract_analysis_facts(&py, py_src, &extract_symbols(&py, py_src));
932        assert!(py_facts.iter().any(|fact| {
933            fact.edge_type == GraphEdgeType::ExposesEndpoint && fact.target == "GET /health"
934        }));
935    }
936
937    // ─── extract_chunks ──────────────────────────────────────────────────────
938
939    #[test]
940    fn chunks_file_with_no_symbols_into_80_line_windows() {
941        let file = rust_file();
942        let content: String = (1..=200).map(|i| format!("line {i}\n")).collect();
943        let chunks = extract_chunks(&file, &content, &[]);
944        assert!(
945            chunks.len() >= 2,
946            "200 lines should produce at least 2 chunks"
947        );
948        for chunk in &chunks {
949            assert!(chunk.symbol_id.is_none());
950        }
951    }
952
953    #[test]
954    fn chunks_file_by_symbol_boundaries() {
955        let file = rust_file();
956        let src = "pub fn alpha() {}\npub fn beta() {}\npub fn gamma() {}";
957        let symbols = extract_symbols(&file, src);
958        assert!(
959            !symbols.is_empty(),
960            "should have symbols from heuristic parser"
961        );
962        let chunks = extract_chunks(&file, src, &symbols);
963        // Each symbol becomes a chunk boundary.
964        assert!(!chunks.is_empty());
965        assert!(chunks.iter().all(|c| c.symbol_id.is_some()));
966    }
967
968    #[test]
969    fn chunks_deduplicate_symbols_starting_on_same_line() {
970        let file = ts_file();
971        let src = "export const handler = () => call();\ncall();";
972        let symbols = vec![
973            Symbol {
974                id: SymbolId::new("handler"),
975                name: "handler".into(),
976                qualified_name: "src::index::handler".into(),
977                kind: SymbolKind::Function,
978                file_id: file.id.clone(),
979                range: Some(LineRange { start: 1, end: 1 }),
980                language: Language::TypeScript,
981                confidence: Confidence::High,
982                provenance: EvidenceSourceType::TreeSitter,
983            },
984            Symbol {
985                id: SymbolId::new("call"),
986                name: "call".into(),
987                qualified_name: "src::index::call".into(),
988                kind: SymbolKind::Function,
989                file_id: file.id.clone(),
990                range: Some(LineRange { start: 1, end: 1 }),
991                language: Language::TypeScript,
992                confidence: Confidence::High,
993                provenance: EvidenceSourceType::TreeSitter,
994            },
995        ];
996
997        let chunks = extract_chunks(&file, src, &symbols);
998
999        assert_eq!(chunks.len(), 1);
1000        assert_eq!(chunks[0].range.start, 1);
1001        assert_eq!(chunks[0].range.end, 2);
1002    }
1003
1004    // ─── extract_tests ────────────────────────────────────────────────────────
1005
1006    #[test]
1007    fn detects_rust_test_attribute() {
1008        let file = rust_file();
1009        let src = "#[test]\nfn it_works() {\n    assert!(true);\n}\n";
1010        let symbols = extract_symbols(&file, src);
1011        let tests = extract_tests(&file, src, &symbols, None);
1012        assert!(!tests.is_empty(), "should detect #[test] function");
1013        assert!(tests[0].command.as_deref() == Some("cargo test"));
1014    }
1015
1016    #[test]
1017    fn test_file_path_causes_all_symbols_to_be_tests() {
1018        let file = File {
1019            id: FileId::new("test-file"),
1020            repository_id: RepositoryId::new("repo"),
1021            path: "src/worker_test.rs".into(),
1022            language: Language::Rust,
1023            size_bytes: 0,
1024            content_hash: "hash".into(),
1025            is_generated: false,
1026            is_vendor: false,
1027        };
1028        let src = "pub fn some_helper() {}\n";
1029        let symbols = extract_symbols(&file, src);
1030        let tests = extract_tests(&file, src, &symbols, None);
1031        // All symbols in a test file become test targets.
1032        assert_eq!(tests.len(), symbols.len());
1033    }
1034}