1use chrono::Utc;
2use open_kioku_core::{
3 AnalysisFact, CodeChunk, Confidence, EvidenceSourceType, File, GraphEdgeType, GraphNodeType,
4 Import, Language, LineRange, ScoreComponent, Symbol, SymbolId, SymbolKind, TestTarget,
5};
6use regex::Regex;
7use sha2::{Digest, Sha256};
8use std::collections::HashSet;
9
/// Everything a [`Parser`] extracts from one source file.
#[derive(Debug, Clone)]
pub struct ParsedFile {
    /// Text chunks cut from the file (per symbol, or fixed windows when no symbol is known).
    pub chunks: Vec<CodeChunk>,
    /// Symbols (functions, classes, modules, ...) found in the file.
    pub symbols: Vec<Symbol>,
    /// Import-like statements found in the file.
    pub imports: Vec<Import>,
    /// Static-analysis facts such as inheritance, endpoints and config reads.
    pub analysis_facts: Vec<AnalysisFact>,
    /// Symbols that look like test targets.
    pub tests: Vec<TestTarget>,
}
18
/// Turns the text of a source file into a [`ParsedFile`].
pub trait Parser: Send + Sync {
    /// Parses `content` without a build hint; forwards to
    /// [`Parser::parse_with_hint`] with `None`.
    fn parse(&self, file: &File, content: &str) -> ParsedFile {
        self.parse_with_hint(file, content, None)
    }
    /// Parses `content` belonging to `file`.
    ///
    /// `build_hint` is an optional free-form hint; its meaning is up to the
    /// implementation.
    // NOTE(review): presumably names the build tool (e.g. "gradle") — confirm with callers.
    fn parse_with_hint(&self, file: &File, content: &str, build_hint: Option<&str>) -> ParsedFile;
}
25
26#[derive(Default)]
27pub struct HeuristicParser;
28
29impl Parser for HeuristicParser {
30 fn parse_with_hint(&self, file: &File, content: &str, build_hint: Option<&str>) -> ParsedFile {
31 let imports = extract_imports(file, content);
32 let mut symbols = extract_symbols(file, content);
33 dedupe_symbols(&mut symbols);
34 let analysis_facts = extract_analysis_facts(file, content, &symbols);
35 let mut chunks = extract_chunks(file, content, &symbols);
36 dedupe_chunks(&mut chunks);
37 let tests = extract_tests(file, content, &symbols, build_hint);
38 ParsedFile {
39 chunks,
40 symbols,
41 imports,
42 analysis_facts,
43 tests,
44 }
45 }
46}
47
48fn dedupe_symbols(symbols: &mut Vec<Symbol>) {
49 let mut seen = HashSet::new();
50 symbols.retain(|symbol| seen.insert(symbol.id.clone()));
51}
52
53fn dedupe_chunks(chunks: &mut Vec<CodeChunk>) {
54 let mut seen = HashSet::new();
55 chunks.retain(|chunk| seen.insert(chunk.id.clone()));
56}
57
58pub fn extract_symbols(file: &File, content: &str) -> Vec<Symbol> {
59 if let Ok(symbols) = open_kioku_tree_sitter::parse_symbols(file, content) {
60 if !symbols.is_empty() {
61 return symbols;
62 }
63 }
64 match file.language {
65 Language::Rust => extract_with_patterns(
66 file,
67 content,
68 &[
69 (
70 r"^\s*(pub\s+)?(async\s+)?fn\s+([A-Za-z_][A-Za-z0-9_]*)",
71 SymbolKind::Function,
72 3,
73 ),
74 (
75 r"^\s*(pub\s+)?struct\s+([A-Za-z_][A-Za-z0-9_]*)",
76 SymbolKind::Class,
77 2,
78 ),
79 (
80 r"^\s*(pub\s+)?enum\s+([A-Za-z_][A-Za-z0-9_]*)",
81 SymbolKind::Class,
82 2,
83 ),
84 (
85 r"^\s*(pub\s+)?trait\s+([A-Za-z_][A-Za-z0-9_]*)",
86 SymbolKind::Trait,
87 2,
88 ),
89 (r"^\s*mod\s+([A-Za-z_][A-Za-z0-9_]*)", SymbolKind::Module, 1),
90 ],
91 ),
92 Language::Java => extract_with_patterns(
93 file,
94 content,
95 &[
96 (
97 r"\b(class|record)\s+([A-Za-z_][A-Za-z0-9_]*)",
98 SymbolKind::Class,
99 2,
100 ),
101 (
102 r"\binterface\s+([A-Za-z_][A-Za-z0-9_]*)",
103 SymbolKind::Interface,
104 1,
105 ),
106 (
107 r"\b(?:public|private|protected)?\s*(?:static\s+)?[A-Za-z0-9_<>\[\], ?]+\s+([A-Za-z_][A-Za-z0-9_]*)\s*\(",
108 SymbolKind::Method,
109 1,
110 ),
111 ],
112 ),
113 Language::TypeScript | Language::JavaScript => extract_with_patterns(
114 file,
115 content,
116 &[
117 (
118 r"\bfunction\s+([A-Za-z_$][A-Za-z0-9_$]*)",
119 SymbolKind::Function,
120 1,
121 ),
122 (
123 r"\bclass\s+([A-Za-z_$][A-Za-z0-9_$]*)",
124 SymbolKind::Class,
125 1,
126 ),
127 (
128 r"\binterface\s+([A-Za-z_$][A-Za-z0-9_$]*)",
129 SymbolKind::Interface,
130 1,
131 ),
132 (
133 r"\b(?:const|let|var)\s+([A-Za-z_$][A-Za-z0-9_$]*)\s*=\s*(?:async\s*)?\(",
134 SymbolKind::Function,
135 1,
136 ),
137 (
138 r"\bexport\s+(?:const|let|var)\s+([A-Za-z_$][A-Za-z0-9_$]*)",
139 SymbolKind::Variable,
140 1,
141 ),
142 ],
143 ),
144 Language::Python => extract_with_patterns(
145 file,
146 content,
147 &[
148 (
149 r"^\s*def\s+([A-Za-z_][A-Za-z0-9_]*)",
150 SymbolKind::Function,
151 1,
152 ),
153 (
154 r"^\s*async\s+def\s+([A-Za-z_][A-Za-z0-9_]*)",
155 SymbolKind::Function,
156 1,
157 ),
158 (
159 r"^\s*class\s+([A-Za-z_][A-Za-z0-9_]*)",
160 SymbolKind::Class,
161 1,
162 ),
163 ],
164 ),
165 Language::Go => extract_with_patterns(
166 file,
167 content,
168 &[
169 (
170 r"^\s*func\s+(?:\([^)]+\)\s*)?([A-Za-z_][A-Za-z0-9_]*)",
171 SymbolKind::Function,
172 1,
173 ),
174 (
175 r"^\s*type\s+([A-Za-z_][A-Za-z0-9_]*)\s+struct",
176 SymbolKind::Class,
177 1,
178 ),
179 (
180 r"^\s*type\s+([A-Za-z_][A-Za-z0-9_]*)\s+interface",
181 SymbolKind::Interface,
182 1,
183 ),
184 ],
185 ),
186 Language::Sql => extract_with_patterns(
187 file,
188 content,
189 &[(
190 r"(?i)^\s*create\s+table\s+([A-Za-z_][A-Za-z0-9_\.]*)",
191 SymbolKind::DatabaseTable,
192 1,
193 )],
194 ),
195 _ => Vec::new(),
196 }
197}
198
199fn extract_with_patterns(
200 file: &File,
201 content: &str,
202 specs: &[(&str, SymbolKind, usize)],
203) -> Vec<Symbol> {
204 let compiled = specs
205 .iter()
206 .filter_map(|(pattern, kind, capture)| {
207 Regex::new(pattern)
208 .ok()
209 .map(|re| (re, kind.clone(), *capture))
210 })
211 .collect::<Vec<_>>();
212 let mut symbols = Vec::new();
213 for (idx, line) in content.lines().enumerate() {
214 for (regex, kind, capture) in &compiled {
215 if let Some(captures) = regex.captures(line) {
216 if let Some(name) = captures.get(*capture) {
217 let line_number = (idx + 1) as u32;
218 let qualified_name = qualified_name(file, name.as_str());
219 symbols.push(Symbol {
220 id: SymbolId::new(stable_id(&format!(
221 "{}:{}:{}",
222 file.path.display(),
223 line_number,
224 qualified_name
225 ))),
226 name: name.as_str().to_string(),
227 qualified_name,
228 kind: kind.clone(),
229 file_id: file.id.clone(),
230 range: Some(LineRange::single(line_number)),
231 language: file.language.clone(),
232 confidence: Confidence::Medium,
233 provenance: EvidenceSourceType::Heuristic,
234 });
235 }
236 }
237 }
238 }
239 symbols
240}
241
242pub fn extract_imports(file: &File, content: &str) -> Vec<Import> {
243 let patterns = match file.language {
244 Language::Rust => vec![r"^\s*use\s+([^;]+)", r"^\s*mod\s+([A-Za-z_][A-Za-z0-9_]*)"],
245 Language::Java => vec![r"^\s*import\s+([^;]+)"],
246 Language::TypeScript | Language::JavaScript => {
247 vec["']"#, r#"import\s+["']([^"']+)["']"#]
248 }
249 Language::Python => vec![
250 r"^\s*import\s+([A-Za-z0-9_\.]+)",
251 r"^\s*from\s+([A-Za-z0-9_\.]+)\s+import",
252 ],
253 Language::Go => vec![r#"^\s*import\s+"([^"]+)""#],
254 _ => Vec::new(),
255 };
256 let compiled = patterns
257 .iter()
258 .filter_map(|pattern| Regex::new(pattern).ok())
259 .collect::<Vec<_>>();
260 let mut imports = Vec::new();
261 for (idx, line) in content.lines().enumerate() {
262 for regex in &compiled {
263 if let Some(captures) = regex.captures(line) {
264 if let Some(value) = captures.get(1) {
265 imports.push(Import {
266 file_id: file.id.clone(),
267 imported: value.as_str().trim().to_string(),
268 range: Some(LineRange::single((idx + 1) as u32)),
269 confidence: Confidence::Medium,
270 });
271 }
272 }
273 }
274 }
275 imports
276}
277
278pub fn extract_analysis_facts(file: &File, content: &str, symbols: &[Symbol]) -> Vec<AnalysisFact> {
279 match file.language {
280 Language::Java => extract_java_analysis_facts(file, content, symbols),
281 Language::TypeScript | Language::JavaScript => {
282 extract_javascript_analysis_facts(file, content, symbols)
283 }
284 Language::Python => extract_python_analysis_facts(file, content, symbols),
285 Language::Rust => extract_rust_analysis_facts(file, content, symbols),
286 _ => Vec::new(),
287 }
288}
289
290fn extract_java_analysis_facts(
291 file: &File,
292 content: &str,
293 symbols: &[Symbol],
294) -> Vec<AnalysisFact> {
295 let mut facts = Vec::new();
296 let class_re = Regex::new(
297 r"\b(?:class|record|enum)\s+([A-Za-z_][A-Za-z0-9_]*)(?:\s+extends\s+([A-Za-z0-9_.$<>]+))?(?:\s+implements\s+([A-Za-z0-9_.$<>,\s]+))?",
298 )
299 .expect("valid Java class regex");
300 let interface_re = Regex::new(
301 r"\binterface\s+([A-Za-z_][A-Za-z0-9_]*)(?:\s+extends\s+([A-Za-z0-9_.$<>,\s]+))?",
302 )
303 .expect("valid Java interface regex");
304 let mapping_re = Regex::new(
305 r#"@(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)(?:\s*\(\s*(?:value\s*=\s*)?["']([^"']+)["'])?"#,
306 )
307 .expect("valid Spring mapping regex");
308 let env_re =
309 Regex::new(r#"System\.getenv\(\s*["']([^"']+)["']\s*\)"#).expect("valid getenv regex");
310 let value_re = Regex::new(r#"@Value\(\s*["']\$\{([^}:]+)(?::[^}]*)?\}["']\s*\)"#)
311 .expect("valid Spring value regex");
312 let table_re =
313 Regex::new(r#"@Table\(\s*name\s*=\s*["']([^"']+)["']"#).expect("valid table regex");
314
315 for (idx, line) in content.lines().enumerate() {
316 let line_number = (idx + 1) as u32;
317 if let Some(captures) = class_re.captures(line) {
318 let source = captures.get(1).map(|value| value.as_str());
319 let source_symbol = source.and_then(|name| symbol_named(symbols, name));
320 if let Some(base) = captures.get(2) {
321 facts.push(analysis_fact(
322 file,
323 source_symbol,
324 GraphEdgeType::Extends,
325 GraphNodeType::Class,
326 clean_java_type(base.as_str()),
327 line_number,
328 ("open-kioku-static/java", "Java class inheritance"),
329 ));
330 }
331 if let Some(interfaces) = captures.get(3) {
332 for interface in split_java_types(interfaces.as_str()) {
333 facts.push(analysis_fact(
334 file,
335 source_symbol,
336 GraphEdgeType::Implements,
337 GraphNodeType::Interface,
338 interface,
339 line_number,
340 ("open-kioku-static/java", "Java implemented interface"),
341 ));
342 }
343 }
344 }
345 if let Some(captures) = interface_re.captures(line) {
346 let source = captures.get(1).map(|value| value.as_str());
347 let source_symbol = source.and_then(|name| symbol_named(symbols, name));
348 if let Some(parents) = captures.get(2) {
349 for parent in split_java_types(parents.as_str()) {
350 facts.push(analysis_fact(
351 file,
352 source_symbol,
353 GraphEdgeType::Extends,
354 GraphNodeType::Interface,
355 parent,
356 line_number,
357 ("open-kioku-static/java", "Java interface inheritance"),
358 ));
359 }
360 }
361 }
362 if let Some(captures) = mapping_re.captures(line) {
363 let method = spring_http_method(captures.get(1).map(|value| value.as_str()));
364 let route = captures.get(2).map(|value| value.as_str()).unwrap_or("/");
365 let source_symbol = symbol_at_or_after(symbols, line_number, 4);
366 facts.push(analysis_fact(
367 file,
368 source_symbol,
369 GraphEdgeType::ExposesEndpoint,
370 GraphNodeType::Endpoint,
371 format!("{method} {route}"),
372 line_number,
373 ("open-kioku-static/java", "Spring MVC endpoint mapping"),
374 ));
375 }
376 for captures in env_re.captures_iter(line) {
377 if let Some(key) = captures.get(1) {
378 facts.push(analysis_fact(
379 file,
380 symbol_at_or_before(symbols, line_number),
381 GraphEdgeType::ReadsConfig,
382 GraphNodeType::ConfigKey,
383 key.as_str().to_string(),
384 line_number,
385 ("open-kioku-static/java", "Java environment variable read"),
386 ));
387 }
388 }
389 if let Some(captures) = value_re.captures(line) {
390 if let Some(key) = captures.get(1) {
391 facts.push(analysis_fact(
392 file,
393 symbol_at_or_after(symbols, line_number, 3),
394 GraphEdgeType::ReadsConfig,
395 GraphNodeType::ConfigKey,
396 key.as_str().to_string(),
397 line_number,
398 ("open-kioku-static/java", "Spring configuration value read"),
399 ));
400 }
401 }
402 if let Some(captures) = table_re.captures(line) {
403 if let Some(table) = captures.get(1) {
404 facts.push(analysis_fact(
405 file,
406 symbol_at_or_after(symbols, line_number, 3),
407 GraphEdgeType::ReadsTable,
408 GraphNodeType::DatabaseTable,
409 table.as_str().to_string(),
410 line_number,
411 ("open-kioku-static/java", "JPA table mapping"),
412 ));
413 }
414 }
415 }
416 dedupe_analysis_facts(&mut facts);
417 facts
418}
419
420fn extract_javascript_analysis_facts(
421 file: &File,
422 content: &str,
423 symbols: &[Symbol],
424) -> Vec<AnalysisFact> {
425 let mut facts = Vec::new();
426 let route_re =
427 Regex::new(r#"\b(?:app|router)\.(get|post|put|delete|patch|all)\(\s*["']([^"']+)["']"#)
428 .expect("valid JavaScript route regex");
429 for (idx, line) in content.lines().enumerate() {
430 let line_number = (idx + 1) as u32;
431 for captures in route_re.captures_iter(line) {
432 let method = captures
433 .get(1)
434 .map(|value| value.as_str().to_ascii_uppercase())
435 .unwrap_or_else(|| "HTTP".into());
436 let route = captures.get(2).map(|value| value.as_str()).unwrap_or("/");
437 facts.push(analysis_fact(
438 file,
439 symbol_at_or_before(symbols, line_number),
440 GraphEdgeType::ExposesEndpoint,
441 GraphNodeType::Endpoint,
442 format!("{method} {route}"),
443 line_number,
444 ("open-kioku-static/javascript", "JavaScript HTTP route"),
445 ));
446 }
447 }
448 facts
449}
450
451fn extract_python_analysis_facts(
452 file: &File,
453 content: &str,
454 symbols: &[Symbol],
455) -> Vec<AnalysisFact> {
456 let mut facts = Vec::new();
457 let route_re = Regex::new(
458 r#"@(?:app|router|blueprint)\.(get|post|put|delete|patch|route)\(\s*["']([^"']+)["']"#,
459 )
460 .expect("valid Python route regex");
461 for (idx, line) in content.lines().enumerate() {
462 let line_number = (idx + 1) as u32;
463 for captures in route_re.captures_iter(line) {
464 let method = match captures.get(1).map(|value| value.as_str()) {
465 Some("route") => "HTTP".to_string(),
466 Some(value) => value.to_ascii_uppercase(),
467 None => "HTTP".into(),
468 };
469 let route = captures.get(2).map(|value| value.as_str()).unwrap_or("/");
470 facts.push(analysis_fact(
471 file,
472 symbol_at_or_after(symbols, line_number, 2),
473 GraphEdgeType::ExposesEndpoint,
474 GraphNodeType::Endpoint,
475 format!("{method} {route}"),
476 line_number,
477 ("open-kioku-static/python", "Python HTTP route decorator"),
478 ));
479 }
480 }
481 facts
482}
483
484fn extract_rust_analysis_facts(
485 file: &File,
486 content: &str,
487 symbols: &[Symbol],
488) -> Vec<AnalysisFact> {
489 let mut facts = Vec::new();
490 let route_re = Regex::new(r#"#\[(get|post|put|delete|patch)\(\s*["']([^"']+)["']\s*\)\]"#)
491 .expect("valid Rust route regex");
492 for (idx, line) in content.lines().enumerate() {
493 let line_number = (idx + 1) as u32;
494 for captures in route_re.captures_iter(line) {
495 let method = captures
496 .get(1)
497 .map(|value| value.as_str().to_ascii_uppercase())
498 .unwrap_or_else(|| "HTTP".into());
499 let route = captures.get(2).map(|value| value.as_str()).unwrap_or("/");
500 facts.push(analysis_fact(
501 file,
502 symbol_at_or_after(symbols, line_number, 2),
503 GraphEdgeType::ExposesEndpoint,
504 GraphNodeType::Endpoint,
505 format!("{method} {route}"),
506 line_number,
507 ("open-kioku-static/rust", "Rust HTTP route attribute"),
508 ));
509 }
510 }
511 facts
512}
513
514fn analysis_fact(
515 file: &File,
516 symbol: Option<&Symbol>,
517 edge_type: GraphEdgeType,
518 target_kind: GraphNodeType,
519 target: String,
520 line_number: u32,
521 source: (&str, &str),
522) -> AnalysisFact {
523 AnalysisFact {
524 id: stable_id(&format!(
525 "analysis:{}:{}:{:?}:{}:{}",
526 file.path.display(),
527 symbol
528 .map(|symbol| symbol.id.0.as_str())
529 .unwrap_or("<file>"),
530 edge_type,
531 target,
532 line_number
533 )),
534 file_id: file.id.clone(),
535 symbol_id: symbol.map(|symbol| symbol.id.clone()),
536 target,
537 target_kind,
538 edge_type,
539 range: Some(LineRange::single(line_number)),
540 confidence: Confidence::Medium,
541 source: source.0.into(),
542 source_type: EvidenceSourceType::StaticAnalysis,
543 message: source.1.into(),
544 }
545}
546
547fn symbol_named<'a>(symbols: &'a [Symbol], name: &str) -> Option<&'a Symbol> {
548 symbols.iter().find(|symbol| symbol.name == name)
549}
550
551fn symbol_at_or_after(symbols: &[Symbol], line_number: u32, max_distance: u32) -> Option<&Symbol> {
552 symbols
553 .iter()
554 .filter_map(|symbol| {
555 let start = symbol.range.as_ref()?.start;
556 (start >= line_number && start <= line_number + max_distance).then_some((start, symbol))
557 })
558 .min_by_key(|(start, _)| *start)
559 .map(|(_, symbol)| symbol)
560}
561
562fn symbol_at_or_before(symbols: &[Symbol], line_number: u32) -> Option<&Symbol> {
563 symbols
564 .iter()
565 .filter_map(|symbol| {
566 let start = symbol.range.as_ref()?.start;
567 (start <= line_number).then_some((start, symbol))
568 })
569 .max_by_key(|(start, _)| *start)
570 .map(|(_, symbol)| symbol)
571}
572
/// Normalises a Java type reference: strips surrounding whitespace and
/// commas, drops everything from the first `<` onwards, and trims the rest.
fn clean_java_type(value: &str) -> String {
    let stripped = value.trim().trim_matches(',');
    let base = match stripped.find('<') {
        Some(generic_start) => &stripped[..generic_start],
        None => stripped,
    };
    base.trim().to_string()
}
583
584fn split_java_types(value: &str) -> Vec<String> {
585 value
586 .split(',')
587 .map(clean_java_type)
588 .filter(|value| !value.is_empty())
589 .collect()
590}
591
/// Maps a Spring mapping annotation name to its HTTP verb.
///
/// `RequestMapping`, unknown names and `None` all map to the generic `HTTP`.
fn spring_http_method(annotation: Option<&str>) -> &'static str {
    const VERBS: &[(&str, &str)] = &[
        ("GetMapping", "GET"),
        ("PostMapping", "POST"),
        ("PutMapping", "PUT"),
        ("DeleteMapping", "DELETE"),
        ("PatchMapping", "PATCH"),
    ];
    annotation
        .and_then(|name| VERBS.iter().find(|(known, _)| *known == name))
        .map_or("HTTP", |(_, verb)| *verb)
}
603
604fn dedupe_analysis_facts(facts: &mut Vec<AnalysisFact>) {
605 let mut seen = HashSet::new();
606 facts.retain(|fact| seen.insert(fact.id.clone()));
607}
608
609pub fn extract_chunks(file: &File, content: &str, symbols: &[Symbol]) -> Vec<CodeChunk> {
610 if content.trim().is_empty() {
611 return Vec::new();
612 }
613 let lines = content.lines().collect::<Vec<_>>();
614 let mut chunks = Vec::new();
615 let mut starts = symbols
616 .iter()
617 .filter_map(|symbol| {
618 symbol
619 .range
620 .as_ref()
621 .map(|range| (range.start as usize, symbol.id.clone()))
622 })
623 .collect::<Vec<_>>();
624 starts.sort_by_key(|(line, _)| *line);
625 starts.dedup_by_key(|(line, _)| *line);
626 if starts.is_empty() {
627 for (idx, window) in lines.chunks(80).enumerate() {
628 let start = idx * 80 + 1;
629 let end = start + window.len().saturating_sub(1);
630 chunks.push(CodeChunk {
631 id: stable_id(&format!("{}:{start}:{end}", file.path.display())),
632 file_id: file.id.clone(),
633 range: LineRange {
634 start: start as u32,
635 end: end as u32,
636 },
637 language: file.language.clone(),
638 text: window.join("\n"),
639 symbol_id: None,
640 });
641 }
642 return chunks;
643 }
644 for (idx, (start, symbol_id)) in starts.iter().enumerate() {
645 let next = starts
646 .get(idx + 1)
647 .map(|(line, _)| *line)
648 .unwrap_or(lines.len() + 1);
649 let end = next.saturating_sub(1).min(lines.len());
650 let text = lines[start.saturating_sub(1)..end].join("\n");
651 chunks.push(CodeChunk {
652 id: stable_id(&format!("{}:{start}:{end}", file.path.display())),
653 file_id: file.id.clone(),
654 range: LineRange {
655 start: *start as u32,
656 end: end as u32,
657 },
658 language: file.language.clone(),
659 text,
660 symbol_id: Some(symbol_id.clone()),
661 });
662 }
663 chunks
664}
665
666pub fn extract_tests(
667 file: &File,
668 content: &str,
669 symbols: &[Symbol],
670 build_hint: Option<&str>,
671) -> Vec<TestTarget> {
672 let path = file.path.to_string_lossy().to_ascii_lowercase();
673 let is_test_file = path.contains("/test/")
674 || path.contains("/tests/")
675 || path.ends_with("_test.rs")
676 || path.ends_with("_test.go")
677 || path.ends_with("test.java")
678 || path.ends_with(".spec.ts")
679 || path.ends_with(".test.ts")
680 || path.ends_with("_test.py");
681
682 symbols
683 .iter()
684 .filter(|symbol| {
685 is_test_file
686 || symbol.name.starts_with("test")
687 || content
688 .lines()
689 .any(|line| line.contains("#[test]") || line.contains("@Test"))
690 })
691 .map(|symbol| TestTarget {
692 id: stable_id(&format!("test:{}:{}", file.path.display(), symbol.name)),
693 name: symbol.name.clone(),
694 file_id: file.id.clone(),
695 range: symbol.range.clone(),
696 command: recommended_command(&file.language, &file.path.to_string_lossy(), build_hint),
697 confidence: if is_test_file {
698 Confidence::High
699 } else {
700 Confidence::Medium
701 },
702 reason: "test-like path, annotation, or naming convention".into(),
703 evidence_refs: vec![stable_id(&format!(
704 "test:{}:{}",
705 file.path.display(),
706 symbol.name
707 ))],
708 score_breakdown: vec![ScoreComponent::single(
709 "indexed_test_confidence",
710 if is_test_file {
711 Confidence::High.score()
712 } else {
713 Confidence::Medium.score()
714 },
715 vec![stable_id(&format!(
716 "test:{}:{}",
717 file.path.display(),
718 symbol.name
719 ))],
720 "test-like path, annotation, or naming convention",
721 )],
722 })
723 .collect()
724}
725
726fn qualified_name(file: &File, name: &str) -> String {
727 let stem = file
728 .path
729 .with_extension("")
730 .to_string_lossy()
731 .replace(['/', '\\'], "::");
732 format!("{stem}::{name}")
733}
734
735fn stable_id(value: &str) -> String {
736 let mut hasher = Sha256::new();
737 hasher.update(value.as_bytes());
738 format!("{:x}", hasher.finalize())
739}
740
741fn recommended_command(
742 language: &Language,
743 path: &str,
744 build_hint: Option<&str>,
745) -> Option<String> {
746 match (language, build_hint) {
747 (Language::Java, Some("gradle")) => Some("./gradlew test".into()),
748 (Language::Java, Some("bazel")) => Some("bazel test //...".into()),
749 (Language::Java, Some("maven") | _) => Some("mvn test".into()),
750 (Language::Rust, _) => Some("cargo test".into()),
751 (Language::TypeScript | Language::JavaScript, _) => Some("npm test".into()),
752 (Language::Python, _) => Some("pytest".into()),
753 (Language::Go, _) => Some("go test ./...".into()),
754 _ if path.contains("test") => Some("run repository test command".into()),
755 _ => None,
756 }
757}
758
/// Returns the current time in UTC.
// NOTE(review): presumably used to timestamp evidence records — confirm against callers.
pub fn evidence_timestamp() -> chrono::DateTime<Utc> {
    Utc::now()
}
762
// Unit tests for the public extraction helpers of this module.
#[cfg(test)]
mod tests {
    use super::{
        extract_analysis_facts, extract_chunks, extract_imports, extract_symbols, extract_tests,
    };
    use open_kioku_core::{
        Confidence, EvidenceSourceType, File, FileId, GraphEdgeType, GraphNodeType, Language,
        LineRange, RepositoryId, Symbol, SymbolId, SymbolKind,
    };

    /// Fixture: a Rust file at `src/lib.rs`.
    fn rust_file() -> File {
        File {
            id: FileId::new("file-rs"),
            repository_id: RepositoryId::new("repo"),
            path: "src/lib.rs".into(),
            language: Language::Rust,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        }
    }

    /// Fixture: a Python file at `app/service.py`.
    fn python_file() -> File {
        File {
            id: FileId::new("file-py"),
            repository_id: RepositoryId::new("repo"),
            path: "app/service.py".into(),
            language: Language::Python,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        }
    }

    /// Fixture: a TypeScript file at `src/index.ts`.
    fn ts_file() -> File {
        File {
            id: FileId::new("file-ts"),
            repository_id: RepositoryId::new("repo"),
            path: "src/index.ts".into(),
            language: Language::TypeScript,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        }
    }

    /// Fixture: a Java file at `src/main/java/com/acme/OrderController.java`.
    fn java_file() -> File {
        File {
            id: FileId::new("file-java"),
            repository_id: RepositoryId::new("repo"),
            path: "src/main/java/com/acme/OrderController.java".into(),
            language: Language::Java,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        }
    }

    /// A function, a struct, a trait and a module are all reported by name.
    #[test]
    fn extracts_rust_functions_and_structs() {
        let file = rust_file();
        let src = "pub fn do_work() {}\npub struct Worker;\npub trait Runnable {}\nmod utils {}";
        let symbols = extract_symbols(&file, src);
        let names: Vec<_> = symbols.iter().map(|s| s.name.as_str()).collect();
        assert!(names.contains(&"do_work"), "should find function");
        assert!(names.contains(&"Worker"), "should find struct");
        assert!(names.contains(&"Runnable"), "should find trait");
        assert!(names.contains(&"utils"), "should find module");
    }

    /// A Python class and a top-level function are both reported by name.
    #[test]
    fn extracts_python_class_and_function() {
        let file = python_file();
        let src = "class MyService:\n pass\n\ndef handle_request():\n pass\n";
        let symbols = extract_symbols(&file, src);
        let names: Vec<_> = symbols.iter().map(|s| s.name.as_str()).collect();
        assert!(names.contains(&"MyService"), "should find class");
        assert!(names.contains(&"handle_request"), "should find function");
    }

    /// Passes when `ApiClient` is found or when any symbol at all is extracted.
    #[test]
    fn extracts_typescript_class_and_function() {
        let file = ts_file();
        let src = "class ApiClient {}\nfunction fetchData() {}\nconst handler = () => {};";
        let symbols = extract_symbols(&file, src);
        let names: Vec<_> = symbols.iter().map(|s| s.name.as_str()).collect();
        assert!(names.contains(&"ApiClient") || !symbols.is_empty());
    }

    /// Two `use` statements produce two imports.
    #[test]
    fn extracts_rust_use_imports() {
        let file = rust_file();
        let src = "use std::collections::HashMap;\nuse crate::worker::Worker;";
        let imports = extract_imports(&file, src);
        assert_eq!(imports.len(), 2);
        assert!(imports.iter().any(|i| i.imported.contains("HashMap")));
    }

    /// `import x` and `from x import y` both report the module name `x`.
    #[test]
    fn extracts_python_imports() {
        let file = python_file();
        let src = "import os\nfrom pathlib import Path\n";
        let imports = extract_imports(&file, src);
        assert_eq!(imports.len(), 2);
        assert!(imports.iter().any(|i| i.imported == "os"));
        assert!(imports.iter().any(|i| i.imported == "pathlib"));
    }

    /// At least one import is found, and one of them mentions `foo`.
    #[test]
    fn extracts_typescript_imports() {
        let file = ts_file();
        let src = "import { foo } from './foo';\nimport './styles.css';";
        let imports = extract_imports(&file, src);
        assert!(!imports.is_empty());
        assert!(imports.iter().any(|i| i.imported.contains("foo")));
    }

    /// One Java class yields inheritance, interface, endpoint and config-read facts.
    #[test]
    fn extracts_java_static_analysis_facts() {
        let file = java_file();
        let src = r#"
class OrderController extends BaseController implements OrderApi, Audited {
 @GetMapping("/orders/{id}")
 public Order getOrder() {
 System.getenv("ORDER_REGION");
 return null;
 }
}
"#;
        let symbols = extract_symbols(&file, src);
        let facts = extract_analysis_facts(&file, src, &symbols);
        assert!(facts.iter().any(|fact| {
            fact.edge_type == GraphEdgeType::Extends
                && fact.target == "BaseController"
                && fact.target_kind == GraphNodeType::Class
        }));
        assert!(facts.iter().any(|fact| {
            fact.edge_type == GraphEdgeType::Implements
                && fact.target == "OrderApi"
                && fact.target_kind == GraphNodeType::Interface
        }));
        assert!(facts.iter().any(|fact| {
            fact.edge_type == GraphEdgeType::ExposesEndpoint && fact.target == "GET /orders/{id}"
        }));
        assert!(facts.iter().any(|fact| {
            fact.edge_type == GraphEdgeType::ReadsConfig && fact.target == "ORDER_REGION"
        }));
    }

    /// A TypeScript router call and a Python route decorator both yield endpoint facts.
    #[test]
    fn extracts_route_facts_for_script_languages() {
        let ts = ts_file();
        let ts_src = r#"router.post("/v1/orders", handler);"#;
        let ts_facts = extract_analysis_facts(&ts, ts_src, &extract_symbols(&ts, ts_src));
        assert!(ts_facts.iter().any(|fact| {
            fact.edge_type == GraphEdgeType::ExposesEndpoint && fact.target == "POST /v1/orders"
        }));

        let py = python_file();
        let py_src = "@app.get('/health')\ndef health():\n return {}\n";
        let py_facts = extract_analysis_facts(&py, py_src, &extract_symbols(&py, py_src));
        assert!(py_facts.iter().any(|fact| {
            fact.edge_type == GraphEdgeType::ExposesEndpoint && fact.target == "GET /health"
        }));
    }

    /// Without symbols, 200 lines are split into several chunks, none tied to a symbol.
    #[test]
    fn chunks_file_with_no_symbols_into_80_line_windows() {
        let file = rust_file();
        let content: String = (1..=200).map(|i| format!("line {i}\n")).collect();
        let chunks = extract_chunks(&file, &content, &[]);
        assert!(
            chunks.len() >= 2,
            "200 lines should produce at least 2 chunks"
        );
        for chunk in &chunks {
            assert!(chunk.symbol_id.is_none());
        }
    }

    /// With symbols present, every chunk is tied to a symbol.
    #[test]
    fn chunks_file_by_symbol_boundaries() {
        let file = rust_file();
        let src = "pub fn alpha() {}\npub fn beta() {}\npub fn gamma() {}";
        let symbols = extract_symbols(&file, src);
        assert!(
            !symbols.is_empty(),
            "should have symbols from heuristic parser"
        );
        let chunks = extract_chunks(&file, src, &symbols);
        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.symbol_id.is_some()));
    }

    /// Two symbols starting on the same line produce one chunk that spans to the end of the file.
    #[test]
    fn chunks_deduplicate_symbols_starting_on_same_line() {
        let file = ts_file();
        let src = "export const handler = () => call();\ncall();";
        let symbols = vec![
            Symbol {
                id: SymbolId::new("handler"),
                name: "handler".into(),
                qualified_name: "src::index::handler".into(),
                kind: SymbolKind::Function,
                file_id: file.id.clone(),
                range: Some(LineRange { start: 1, end: 1 }),
                language: Language::TypeScript,
                confidence: Confidence::High,
                provenance: EvidenceSourceType::TreeSitter,
            },
            Symbol {
                id: SymbolId::new("call"),
                name: "call".into(),
                qualified_name: "src::index::call".into(),
                kind: SymbolKind::Function,
                file_id: file.id.clone(),
                range: Some(LineRange { start: 1, end: 1 }),
                language: Language::TypeScript,
                confidence: Confidence::High,
                provenance: EvidenceSourceType::TreeSitter,
            },
        ];

        let chunks = extract_chunks(&file, src, &symbols);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].range.start, 1);
        assert_eq!(chunks[0].range.end, 2);
    }

    /// A `#[test]` marker in the content yields a test target with the cargo command.
    #[test]
    fn detects_rust_test_attribute() {
        let file = rust_file();
        let src = "#[test]\nfn it_works() {\n assert!(true);\n}\n";
        let symbols = extract_symbols(&file, src);
        let tests = extract_tests(&file, src, &symbols, None);
        assert!(!tests.is_empty(), "should detect #[test] function");
        assert!(tests[0].command.as_deref() == Some("cargo test"));
    }

    /// In a file whose path ends with `_test.rs`, every symbol becomes a test target.
    #[test]
    fn test_file_path_causes_all_symbols_to_be_tests() {
        let file = File {
            id: FileId::new("test-file"),
            repository_id: RepositoryId::new("repo"),
            path: "src/worker_test.rs".into(),
            language: Language::Rust,
            size_bytes: 0,
            content_hash: "hash".into(),
            is_generated: false,
            is_vendor: false,
        };
        let src = "pub fn some_helper() {}\n";
        let symbols = extract_symbols(&file, src);
        let tests = extract_tests(&file, src, &symbols, None);
        assert_eq!(tests.len(), symbols.len());
    }
}