1use chrono::Utc;
2use globset::{Glob, GlobSet, GlobSetBuilder};
3use ignore::WalkBuilder;
4use open_kioku_config::OkConfig;
5use open_kioku_core::{
6 AnalysisFact, CodeChunk, Confidence, EvidenceSourceType, File, FileId, GraphEdgeType,
7 GraphNodeType, Import, IndexManifest, IndexQuality, LineRange, Repository, RepositoryId,
8 Symbol, SymbolOccurrence, TestTarget,
9};
10use open_kioku_errors::{OkError, Result};
11use open_kioku_languages::{
12 detect_language, is_supported_code, likely_generated, likely_vendor_path,
13};
14use open_kioku_parse::{HeuristicParser, Parser};
15use open_kioku_scip::ScipIndexReport;
16use rayon::prelude::*;
17use serde_json::Value;
18use sha2::{Digest, Sha256};
19use std::collections::{HashMap, HashSet};
20use std::fs;
21use std::path::Path;
22use std::sync::atomic::{AtomicUsize, Ordering};
23
/// Complete result of one indexing run, as returned by
/// [`Indexer::index_repo_with_progress`].
#[derive(Debug, Clone)]
pub struct IndexSnapshot {
    /// Repository metadata, aggregate counts, and index-quality signals.
    pub manifest: IndexManifest,
    /// Files accepted by the scan phase.
    pub files: Vec<File>,
    /// Symbols from the parser plus, when SCIP is enabled, imported SCIP
    /// symbols; deduplicated by symbol id.
    pub symbols: Vec<Symbol>,
    /// Code chunks produced by the parser for every indexed file.
    pub chunks: Vec<CodeChunk>,
    /// Test targets produced by the parser.
    pub tests: Vec<TestTarget>,
    /// Imports produced by the parser.
    pub imports: Vec<Import>,
    /// One definition occurrence per symbol, plus imported SCIP occurrences
    /// when SCIP is enabled.
    pub occurrences: Vec<SymbolOccurrence>,
    /// Parser analysis facts, followed by runtime-artifact facts and
    /// (when history is enabled) git co-change facts.
    pub analysis_facts: Vec<AnalysisFact>,
    /// SCIP import report; `None` when SCIP is disabled in the config.
    pub scip: Option<ScipIndexReport>,
}
36
/// Progress event handed to the callback of
/// [`Indexer::index_repo_with_progress`].
#[derive(Debug, Clone)]
pub struct IndexProgress {
    /// Name of the current phase. The indexer emits `"scan"`, `"parse"`,
    /// `"extract"`, `"analysis"`, `"occurrences"` and `"scip"`.
    pub phase: &'static str,
    /// During `"scan"`: regular files visited so far. In later phases: the
    /// number of files accepted for indexing.
    pub scanned_files: usize,
    /// During `"scan"`: files accepted so far. During `"parse"`: files parsed
    /// so far. In later phases: the number of accepted files.
    pub indexed_files: usize,
    /// Total number of files to index; `None` while the scan is still running.
    pub total_files: Option<usize>,
}
44
/// Walks a repository and builds an [`IndexSnapshot`] from it.
pub struct Indexer {
    /// Parser used to turn each file's content into symbols, chunks, tests,
    /// imports and analysis facts.
    parser: Box<dyn Parser>,
}
48
49impl Default for Indexer {
50 fn default() -> Self {
51 Self {
52 parser: Box::<HeuristicParser>::default(),
53 }
54 }
55}
56
57impl Indexer {
58 pub fn index_repo(&self, root: impl AsRef<Path>, config: &OkConfig) -> Result<IndexSnapshot> {
59 self.index_repo_with_progress(root, config, |_| {})
60 }
61
62 pub fn index_repo_with_progress<F>(
63 &self,
64 root: impl AsRef<Path>,
65 config: &OkConfig,
66 on_progress: F,
67 ) -> Result<IndexSnapshot>
68 where
69 F: Fn(IndexProgress) + Sync,
70 {
71 let root = root.as_ref().canonicalize()?;
72 let repo_id = RepositoryId::new(stable_id(root.to_string_lossy().as_ref()));
73 let build_hint: Option<String> =
74 if root.join("build.gradle").exists() || root.join("build.gradle.kts").exists() {
75 Some("gradle".to_string())
76 } else if root.join("pom.xml").exists() {
77 Some("maven".to_string())
78 } else if root.join("WORKSPACE").exists()
79 || root.join("BUILD.bazel").exists()
80 || root.join("BUILD").exists()
81 {
82 Some("bazel".to_string())
83 } else {
84 None
85 };
86 let files = self.scan_files(&root, config, &repo_id, &on_progress)?;
87 on_progress(IndexProgress {
88 phase: "parse",
89 scanned_files: files.len(),
90 indexed_files: 0,
91 total_files: Some(files.len()),
92 });
93 let parsed_count = AtomicUsize::new(0);
94 let parsed = files
95 .par_iter()
96 .map(|file| -> Result<_> {
97 let bytes = fs::read(root.join(&file.path))?;
98 let content = String::from_utf8_lossy(&bytes).into_owned();
99 let parsed = self
100 .parser
101 .parse_with_hint(file, &content, build_hint.as_deref());
102 let indexed_files = parsed_count.fetch_add(1, Ordering::Relaxed) + 1;
103 if should_emit_progress(indexed_files, files.len()) {
104 on_progress(IndexProgress {
105 phase: "parse",
106 scanned_files: files.len(),
107 indexed_files,
108 total_files: Some(files.len()),
109 });
110 }
111 Ok(parsed)
112 })
113 .collect::<Result<Vec<_>>>()?;
114 on_progress(IndexProgress {
115 phase: "extract",
116 scanned_files: files.len(),
117 indexed_files: files.len(),
118 total_files: Some(files.len()),
119 });
120
121 let mut symbols = parsed
122 .iter()
123 .flat_map(|file| file.symbols.clone())
124 .collect::<Vec<_>>();
125 dedupe_symbols(&mut symbols);
126 let chunks = parsed
127 .iter()
128 .flat_map(|file| file.chunks.clone())
129 .collect::<Vec<_>>();
130 let tests = parsed
131 .iter()
132 .flat_map(|file| file.tests.clone())
133 .collect::<Vec<_>>();
134 let imports = parsed
135 .iter()
136 .flat_map(|file| file.imports.clone())
137 .collect::<Vec<_>>();
138 let mut analysis_facts = parsed
139 .iter()
140 .flat_map(|file| file.analysis_facts.clone())
141 .collect::<Vec<_>>();
142 let static_analysis_facts = analysis_facts.len();
143 on_progress(IndexProgress {
144 phase: "analysis",
145 scanned_files: files.len(),
146 indexed_files: files.len(),
147 total_files: Some(files.len()),
148 });
149 let runtime_facts = collect_runtime_analysis_facts(&root, &files)?;
150 let runtime_analysis_facts = runtime_facts.len();
151 analysis_facts.extend(runtime_facts);
152 let git_history_facts = if config.history.enabled {
153 collect_git_history_facts(
154 &root,
155 &files,
156 config.history.max_commits,
157 config.history.max_files_per_commit,
158 )?
159 } else {
160 Vec::new()
161 };
162 let git_history_fact_count = git_history_facts.len();
163 analysis_facts.extend(git_history_facts);
164 on_progress(IndexProgress {
165 phase: "occurrences",
166 scanned_files: files.len(),
167 indexed_files: files.len(),
168 total_files: Some(files.len()),
169 });
170 let mut occurrences = derive_occurrences(&chunks, &symbols);
171 let mut scip_report = None;
172 if config.scip.enabled {
173 on_progress(IndexProgress {
174 phase: "scip",
175 scanned_files: files.len(),
176 indexed_files: files.len(),
177 total_files: Some(files.len()),
178 });
179 let (imported, report) =
180 open_kioku_scip::prepare_and_import_scip(&root, &config.scip, &repo_id)?;
181 symbols.extend(imported.symbols);
182 dedupe_symbols(&mut symbols);
183 occurrences.extend(imported.occurrences);
184 scip_report = Some(report);
185 }
186 let repository = Repository {
187 id: repo_id,
188 name: config.repo.name.clone(),
189 root: root.clone(),
190 branch: open_kioku_git::branch(&root),
191 commit: open_kioku_git::commit(&root),
192 indexed_at: Some(Utc::now()),
193 };
194 let quality = index_quality(
195 &root,
196 config,
197 scip_report.as_ref(),
198 tests.len(),
199 imports.len(),
200 AnalysisCounts {
201 static_facts: static_analysis_facts,
202 runtime_facts: runtime_analysis_facts,
203 git_history_facts: git_history_fact_count,
204 },
205 );
206 let manifest = IndexManifest {
207 repository,
208 file_count: files.len(),
209 symbol_count: symbols.len(),
210 chunk_count: chunks.len(),
211 indexed_at: Utc::now(),
212 schema_version: 1,
213 quality,
214 };
215 Ok(IndexSnapshot {
216 manifest,
217 files,
218 symbols,
219 chunks,
220 tests,
221 imports,
222 occurrences,
223 analysis_facts,
224 scip: scip_report,
225 })
226 }
227
228 fn scan_files(
229 &self,
230 root: &Path,
231 config: &OkConfig,
232 repository_id: &RepositoryId,
233 on_progress: &dyn Fn(IndexProgress),
234 ) -> Result<Vec<File>> {
235 let max_size = config.max_file_size_bytes()?;
236 let excludes = compile_globs(&config.index.exclude)?;
237 let denied = compile_globs(&config.paths.deny)?;
238 let mut builder = WalkBuilder::new(root);
239 builder.hidden(!config.security.allow_hidden_files);
240 builder.git_ignore(true).git_exclude(true).parents(true);
241 let mut files = Vec::new();
242 let mut scanned_files = 0;
243 on_progress(IndexProgress {
244 phase: "scan",
245 scanned_files,
246 indexed_files: files.len(),
247 total_files: None,
248 });
249 for entry in builder.build() {
250 let entry = entry.map_err(|err| OkError::Index(err.to_string()))?;
251 if !entry
252 .file_type()
253 .map(|kind| kind.is_file())
254 .unwrap_or(false)
255 {
256 continue;
257 }
258 scanned_files += 1;
259 let path = entry.path();
260 let rel = path.strip_prefix(root).unwrap_or(path).to_path_buf();
261 if excludes.is_match(&rel) || denied.is_match(&rel) {
262 if should_emit_progress(scanned_files, 0) {
263 on_progress(IndexProgress {
264 phase: "scan",
265 scanned_files,
266 indexed_files: files.len(),
267 total_files: None,
268 });
269 }
270 continue;
271 }
272 let metadata = entry
273 .metadata()
274 .map_err(|err| OkError::Index(err.to_string()))?;
275 if metadata.len() > max_size {
276 continue;
277 }
278 let language = detect_language(&rel);
279 if !is_supported_code(&language) {
280 continue;
281 }
282 let bytes = fs::read(path)?;
283 if bytes.contains(&0) {
284 continue;
285 }
286 let content = String::from_utf8_lossy(&bytes);
287 let content_hash = hash_bytes(&bytes);
288 files.push(File {
289 id: FileId::new(stable_id(&rel.to_string_lossy())),
290 repository_id: repository_id.clone(),
291 path: rel.clone(),
292 language,
293 size_bytes: metadata.len(),
294 content_hash,
295 is_generated: likely_generated(&content),
296 is_vendor: likely_vendor_path(&rel),
297 });
298 if should_emit_progress(scanned_files, 0) {
299 on_progress(IndexProgress {
300 phase: "scan",
301 scanned_files,
302 indexed_files: files.len(),
303 total_files: None,
304 });
305 }
306 }
307 on_progress(IndexProgress {
308 phase: "scan",
309 scanned_files,
310 indexed_files: files.len(),
311 total_files: Some(files.len()),
312 });
313 Ok(files)
314 }
315}
316
/// Number of analysis facts per evidence source, passed to `index_quality`.
#[derive(Debug, Clone, Copy)]
struct AnalysisCounts {
    /// Facts produced by the parser.
    static_facts: usize,
    /// Facts read from local runtime artifacts.
    runtime_facts: usize,
    /// Co-change facts derived from git history.
    git_history_facts: usize,
}
323
324fn index_quality(
325 root: &Path,
326 config: &OkConfig,
327 scip_report: Option<&ScipIndexReport>,
328 test_count: usize,
329 import_count: usize,
330 analysis: AnalysisCounts,
331) -> IndexQuality {
332 let mut quality_notes = Vec::new();
333 let build_systems = detect_build_systems(root);
334 let codeql_databases = detect_codeql_databases(root);
335 let coverage_reports = count_analysis_artifacts(root, &["jacoco.xml", "coverage.xml"]);
336 let junit_reports = count_analysis_artifacts(root, &["test-", "junit"]);
337 let mut semantic_provider_notes = Vec::new();
338 if !build_systems.is_empty() {
339 semantic_provider_notes.push(format!(
340 "build systems detected: {}",
341 build_systems.join(", ")
342 ));
343 }
344 if codeql_databases > 0 {
345 semantic_provider_notes.push(format!(
346 "CodeQL database artifacts detected: {codeql_databases}"
347 ));
348 }
349 if coverage_reports > 0 {
350 semantic_provider_notes.push(format!("coverage reports detected: {coverage_reports}"));
351 }
352 if junit_reports > 0 {
353 semantic_provider_notes.push(format!("JUnit-style reports detected: {junit_reports}"));
354 }
355 if analysis.static_facts > 0 {
356 semantic_provider_notes.push(format!(
357 "language static analysis facts detected: {}",
358 analysis.static_facts
359 ));
360 }
361 if analysis.runtime_facts > 0 {
362 semantic_provider_notes.push(format!(
363 "runtime analysis facts detected: {}",
364 analysis.runtime_facts
365 ));
366 }
367 if analysis.git_history_facts > 0 {
368 semantic_provider_notes.push(format!(
369 "git history co-change facts detected: {}",
370 analysis.git_history_facts
371 ));
372 }
373 let scip_mode = format!("{:?}", config.scip.mode).to_ascii_lowercase();
374 if let Some(report) = scip_report {
375 if report.imported_paths.is_empty() {
376 quality_notes.push("SCIP was enabled but no SCIP index was imported".into());
377 }
378 if report.exact_references == 0 {
379 quality_notes.push(
380 "exact reference coverage is unavailable; impact and test selection are heuristic"
381 .into(),
382 );
383 }
384 for attempt in &report.generator_attempts {
385 if !matches!(
386 attempt.status,
387 open_kioku_scip::ScipGeneratorStatus::Generated
388 | open_kioku_scip::ScipGeneratorStatus::Skipped
389 ) {
390 quality_notes.push(format!(
391 "SCIP {} generation {:?}: {}",
392 attempt.language, attempt.status, attempt.message
393 ));
394 }
395 }
396 IndexQuality {
397 scip_enabled: config.scip.enabled,
398 scip_mode,
399 scip_indexes_imported: report.imported_paths.len(),
400 scip_symbols: report.symbols,
401 scip_occurrences: report.occurrences,
402 scip_exact_references: report.exact_references,
403 test_count,
404 import_count,
405 build_systems,
406 codeql_databases,
407 coverage_reports,
408 junit_reports,
409 static_analysis_facts: analysis.static_facts,
410 runtime_analysis_facts: analysis.runtime_facts,
411 git_history_facts: analysis.git_history_facts,
412 semantic_provider_notes,
413 quality_notes,
414 }
415 } else {
416 if !config.scip.enabled {
417 quality_notes
418 .push("SCIP disabled; symbol references use tree-sitter/import heuristics".into());
419 }
420 IndexQuality {
421 scip_enabled: config.scip.enabled,
422 scip_mode,
423 scip_indexes_imported: 0,
424 scip_symbols: 0,
425 scip_occurrences: 0,
426 scip_exact_references: 0,
427 test_count,
428 import_count,
429 build_systems,
430 codeql_databases,
431 coverage_reports,
432 junit_reports,
433 static_analysis_facts: analysis.static_facts,
434 runtime_analysis_facts: analysis.runtime_facts,
435 git_history_facts: analysis.git_history_facts,
436 semantic_provider_notes,
437 quality_notes,
438 }
439 }
440}
441
442fn collect_git_history_facts(
443 root: &Path,
444 files: &[File],
445 max_commits: usize,
446 max_files_per_commit: usize,
447) -> Result<Vec<AnalysisFact>> {
448 let files_by_path = files
449 .iter()
450 .map(|file| (normalize_history_path(&file.path), file))
451 .collect::<HashMap<_, _>>();
452 let records = open_kioku_git::cochange_records(root, max_commits, max_files_per_commit)?;
453 let mut facts = Vec::new();
454 for record in records {
455 let Some(file) = files_by_path.get(&normalize_history_path(&record.path)) else {
456 continue;
457 };
458 if !files_by_path.contains_key(&normalize_history_path(&record.cochanged_path)) {
459 continue;
460 }
461 let id = stable_id(&format!(
462 "git-history:{}:{}",
463 record.path.display(),
464 record.cochanged_path.display()
465 ));
466 let mut message = format!(
467 "git co-change observed in {} commit(s), recency weight {:.2}",
468 record.commit_count, record.recency_weight
469 );
470 if record.test_corun {
471 message.push_str("; includes historical path-to-test co-run");
472 }
473 facts.push(AnalysisFact {
474 id,
475 file_id: file.id.clone(),
476 symbol_id: None,
477 target: normalize_history_path(&record.cochanged_path),
478 target_kind: if record.test_corun {
479 GraphNodeType::Test
480 } else {
481 GraphNodeType::File
482 },
483 edge_type: GraphEdgeType::ChangedBy,
484 range: None,
485 confidence: Confidence::from_score((0.45 + record.recency_weight / 4.0).min(0.90)),
486 source: format!("git-history:{}", record.commits.join(",")),
487 source_type: EvidenceSourceType::GitHistory,
488 message,
489 });
490 if facts.len() >= 5000 {
491 break;
492 }
493 }
494 Ok(dedupe_analysis_facts(facts))
495}
496
/// Lists the build systems whose marker files exist directly under `root`.
///
/// Names are returned in a fixed order (gradle, maven, bazel, cargo, npm, go);
/// a system is reported when at least one of its marker files is present.
fn detect_build_systems(root: &Path) -> Vec<String> {
    const MARKERS: [(&str, &[&str]); 6] = [
        (
            "gradle",
            &[
                "settings.gradle",
                "settings.gradle.kts",
                "build.gradle",
                "build.gradle.kts",
            ],
        ),
        ("maven", &["pom.xml"]),
        ("bazel", &["WORKSPACE", "WORKSPACE.bazel", "MODULE.bazel"]),
        ("cargo", &["Cargo.toml"]),
        ("npm", &["package.json"]),
        ("go", &["go.mod"]),
    ];

    MARKERS
        .iter()
        .filter(|(_, marker_files)| {
            marker_files
                .iter()
                .any(|marker| root.join(marker).exists())
        })
        .map(|(system, _)| (*system).to_string())
        .collect()
}
524
/// Counts the well-known CodeQL database locations under `root` that look
/// populated.
///
/// A location counts when it is a directory containing at least one of
/// `db-java`, `codeql-database.yml`, or `log`.
fn detect_codeql_databases(root: &Path) -> usize {
    const LOCATIONS: [&str; 4] = [
        ".ok/codeql",
        "codeql-db",
        "codeql-database",
        ".codeql/database",
    ];
    const CONTENT_MARKERS: [&str; 3] = ["db-java", "codeql-database.yml", "log"];

    let mut found = 0;
    for location in LOCATIONS {
        let database = root.join(location);
        let populated = database.is_dir()
            && CONTENT_MARKERS
                .iter()
                .any(|marker| database.join(marker).exists());
        if populated {
            found += 1;
        }
    }
    found
}
542
543fn count_analysis_artifacts(root: &Path, names: &[&str]) -> usize {
544 let candidates = [
545 root.join(".ok/analysis"),
546 root.join("build/reports"),
547 root.join("target/site"),
548 root.join("coverage"),
549 ];
550 let mut count = 0;
551 for candidate in candidates {
552 if !candidate.is_dir() {
553 continue;
554 }
555 for entry in walkdir::WalkDir::new(candidate)
556 .max_depth(5)
557 .into_iter()
558 .filter_map(|entry| entry.ok())
559 {
560 if !entry.file_type().is_file() {
561 continue;
562 }
563 let file_name = entry.file_name().to_string_lossy().to_ascii_lowercase();
564 if names.iter().any(|needle| file_name.contains(needle)) {
565 count += 1;
566 }
567 }
568 }
569 count
570}
571
572fn collect_runtime_analysis_facts(root: &Path, files: &[File]) -> Result<Vec<AnalysisFact>> {
573 let files_by_path = files
574 .iter()
575 .map(|file| (normalize_path(&file.path.to_string_lossy()), file))
576 .collect::<HashMap<_, _>>();
577 let mut facts = Vec::new();
578 for runtime_root in [
579 root.join(".ok/runtime"),
580 root.join(".ok/analysis/runtime"),
581 root.join(".ok/analysis"),
582 ] {
583 if !runtime_root.is_dir() {
584 continue;
585 }
586 for entry in walkdir::WalkDir::new(&runtime_root)
587 .max_depth(3)
588 .into_iter()
589 .filter_map(|entry| entry.ok())
590 {
591 if !entry.file_type().is_file() {
592 continue;
593 }
594 let path = entry.path();
595 let Some(file_name) = path.file_name().and_then(|value| value.to_str()) else {
596 continue;
597 };
598 let lower_name = file_name.to_ascii_lowercase();
599 if !lower_name.ends_with(".jsonl")
600 || !(lower_name.contains("span")
601 || lower_name.contains("trace")
602 || lower_name.contains("runtime")
603 || lower_name.contains("otel")
604 || lower_name.contains("log")
605 || lower_name.contains("incident")
606 || lower_name.contains("error")
607 || lower_name.contains("failure"))
608 {
609 continue;
610 }
611 let metadata = entry
612 .metadata()
613 .map_err(|err| OkError::Index(err.to_string()))?;
614 if metadata.len() > 5 * 1024 * 1024 {
615 continue;
616 }
617 let content = fs::read_to_string(path)?;
618 for (idx, line) in content.lines().enumerate() {
619 if facts.len() >= 10_000 {
620 return Ok(dedupe_analysis_facts(facts));
621 }
622 let trimmed = line.trim();
623 if trimmed.is_empty() {
624 continue;
625 }
626 let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
627 continue;
628 };
629 let Some(source_file) =
630 json_string(&value, &["file", "code.filepath", "source.file"])
631 else {
632 continue;
633 };
634 let normalized = normalize_runtime_file(root, &source_file);
635 let Some(file) = files_by_path.get(&normalized).copied() else {
636 continue;
637 };
638 let line_number = json_u32(&value, &["line", "code.lineno", "source.line"]);
639 if let Some(fact) = runtime_endpoint_fact(file, &value, line_number, path, idx + 1)
640 {
641 facts.push(fact);
642 }
643 if let Some(fact) = runtime_table_fact(file, &value, line_number, path, idx + 1) {
644 facts.push(fact);
645 }
646 if let Some(fact) = runtime_incident_fact(file, &value, line_number, path, idx + 1)
647 {
648 facts.push(fact);
649 }
650 }
651 }
652 }
653 Ok(dedupe_analysis_facts(facts))
654}
655
656fn runtime_endpoint_fact(
657 file: &File,
658 value: &Value,
659 line_number: Option<u32>,
660 artifact: &Path,
661 artifact_line: usize,
662) -> Option<AnalysisFact> {
663 let route = json_string(
664 value,
665 &[
666 "http.route",
667 "http.target",
668 "url.path",
669 "route",
670 "name",
671 "span.name",
672 ],
673 )?;
674 if !route.contains('/') {
675 return None;
676 }
677 let method = json_string(
678 value,
679 &[
680 "http.request.method",
681 "http.method",
682 "method",
683 "request.method",
684 ],
685 )
686 .unwrap_or_else(|| "HTTP".into())
687 .to_ascii_uppercase();
688 Some(runtime_fact(
689 file,
690 GraphEdgeType::ExposesEndpoint,
691 GraphNodeType::Endpoint,
692 format!("{method} {route}"),
693 line_number,
694 RuntimeFactSource {
695 artifact,
696 artifact_line,
697 message: "runtime endpoint observed in local trace artifact",
698 },
699 ))
700}
701
702fn runtime_table_fact(
703 file: &File,
704 value: &Value,
705 line_number: Option<u32>,
706 artifact: &Path,
707 artifact_line: usize,
708) -> Option<AnalysisFact> {
709 let statement = json_string(value, &["db.statement", "sql", "database.statement"])?;
710 let table = extract_sql_table(&statement)?;
711 Some(runtime_fact(
712 file,
713 GraphEdgeType::ReadsTable,
714 GraphNodeType::DatabaseTable,
715 table,
716 line_number,
717 RuntimeFactSource {
718 artifact,
719 artifact_line,
720 message: "runtime database access observed in local trace artifact",
721 },
722 ))
723}
724
725fn runtime_incident_fact(
726 file: &File,
727 value: &Value,
728 line_number: Option<u32>,
729 artifact: &Path,
730 artifact_line: usize,
731) -> Option<AnalysisFact> {
732 let message = json_string(
733 value,
734 &[
735 "error.message",
736 "exception.message",
737 "log.message",
738 "message",
739 "event.message",
740 "span.status.message",
741 "name",
742 "span.name",
743 ],
744 )?;
745 let signal = compact_runtime_message(&message)?;
746 Some(runtime_fact(
747 file,
748 GraphEdgeType::FailedIn,
749 GraphNodeType::RuntimeError,
750 signal,
751 line_number,
752 RuntimeFactSource {
753 artifact,
754 artifact_line,
755 message: "runtime incident observed in local log or failure artifact",
756 },
757 ))
758}
759
/// Provenance of a runtime fact: where it was read from and how to describe it.
struct RuntimeFactSource<'a> {
    /// Path of the artifact file the record was read from.
    artifact: &'a Path,
    /// Line of the record inside the artifact; mixed into the fact id.
    artifact_line: usize,
    /// Fixed description stored as the fact's message.
    message: &'static str,
}
765
766fn runtime_fact(
767 file: &File,
768 edge_type: GraphEdgeType,
769 target_kind: GraphNodeType,
770 target: String,
771 line_number: Option<u32>,
772 source: RuntimeFactSource<'_>,
773) -> AnalysisFact {
774 AnalysisFact {
775 id: stable_id(&format!(
776 "runtime:{}:{:?}:{}:{}",
777 file.path.display(),
778 edge_type,
779 target,
780 source.artifact_line
781 )),
782 file_id: file.id.clone(),
783 symbol_id: None,
784 target,
785 target_kind,
786 edge_type,
787 range: line_number.map(LineRange::single),
788 confidence: Confidence::High,
789 source: format!("open-kioku-runtime:{}", source.artifact.display()),
790 source_type: EvidenceSourceType::Runtime,
791 message: source.message.into(),
792 }
793}
794
795fn json_string(value: &Value, keys: &[&str]) -> Option<String> {
796 for key in keys {
797 if let Some(value) = nested_json_value(value, key).and_then(Value::as_str) {
798 return Some(value.to_string());
799 }
800 if let Some(value) = value
801 .get("attributes")
802 .and_then(|attributes| nested_json_value(attributes, key))
803 .and_then(Value::as_str)
804 {
805 return Some(value.to_string());
806 }
807 if let Some(value) = value
808 .get("resource")
809 .and_then(|resource| resource.get("attributes"))
810 .and_then(|attributes| nested_json_value(attributes, key))
811 .and_then(Value::as_str)
812 {
813 return Some(value.to_string());
814 }
815 }
816 None
817}
818
819fn json_u32(value: &Value, keys: &[&str]) -> Option<u32> {
820 for key in keys {
821 if let Some(value) = nested_json_value(value, key)
822 .and_then(Value::as_u64)
823 .and_then(|value| u32::try_from(value).ok())
824 {
825 return Some(value);
826 }
827 if let Some(value) = value
828 .get("attributes")
829 .and_then(|attributes| nested_json_value(attributes, key))
830 .and_then(Value::as_u64)
831 .and_then(|value| u32::try_from(value).ok())
832 {
833 return Some(value);
834 }
835 }
836 None
837}
838
839fn nested_json_value<'a>(value: &'a Value, key: &str) -> Option<&'a Value> {
840 if let Some(exact) = value.get(key) {
841 return Some(exact);
842 }
843 let mut current = value;
844 for segment in key.split('.') {
845 current = current.get(segment)?;
846 }
847 Some(current)
848}
849
850fn normalize_runtime_file(root: &Path, value: &str) -> String {
851 let path = Path::new(value);
852 let rel = if path.is_absolute() {
853 path.strip_prefix(root).unwrap_or(path)
854 } else {
855 path
856 };
857 normalize_path(&rel.to_string_lossy())
858}
859
/// Normalizes a path string for use as a lookup key: backslashes become
/// forward slashes and any leading `./` segments are removed.
///
/// Separators are converted before the prefix is trimmed so that a
/// Windows-style relative prefix (`.\src\a.rs`) is stripped as well.
fn normalize_path(value: &str) -> String {
    value
        .replace('\\', "/")
        .trim_start_matches("./")
        .to_string()
}
863
864fn normalize_history_path(path: &Path) -> String {
865 normalize_path(&path.to_string_lossy())
866}
867
/// Extracts a table name from a SQL statement, if one can be found.
///
/// The keywords `from`, `join`, `update` and `into` are tried in that order,
/// case-insensitively. A keyword matches when it stands as a whole word: it
/// is at the start of the statement or preceded by whitespace, and it is
/// followed by whitespace. This covers statements that begin with the keyword
/// (`UPDATE users ...`) and statements formatted with newlines or tabs.
///
/// The table is the first run of ASCII alphanumerics, `_` and `.` after the
/// matched keyword. Returns `None` when no keyword matches or nothing
/// identifier-like follows the first matching keyword.
fn extract_sql_table(statement: &str) -> Option<String> {
    // ASCII lowercasing preserves byte offsets, so indices found in `lower`
    // are valid slice boundaries in `statement`.
    let lower = statement.to_ascii_lowercase();
    for keyword in ["from", "join", "update", "into"] {
        let hit = lower.match_indices(keyword).find(|(index, _)| {
            let starts_word = lower[..*index]
                .chars()
                .next_back()
                .map_or(true, char::is_whitespace);
            let ends_word = lower[*index + keyword.len()..]
                .chars()
                .next()
                .map_or(false, char::is_whitespace);
            starts_word && ends_word
        });
        if let Some((index, _)) = hit {
            let table = statement[index + keyword.len()..]
                .split(|ch: char| !ch.is_ascii_alphanumeric() && ch != '_' && ch != '.')
                .find(|part| !part.is_empty())?;
            return Some(table.to_string());
        }
    }
    None
}
881
/// Trims `message` and keeps at most its first 160 characters.
///
/// Returns `None` when the message is empty or whitespace only.
fn compact_runtime_message(message: &str) -> Option<String> {
    const MAX_CHARS: usize = 160;
    let trimmed = message.trim();
    (!trimmed.is_empty()).then(|| trimmed.chars().take(MAX_CHARS).collect())
}
889
890fn dedupe_analysis_facts(mut facts: Vec<AnalysisFact>) -> Vec<AnalysisFact> {
891 let mut seen = HashSet::new();
892 facts.retain(|fact| seen.insert(fact.id.clone()));
893 facts
894}
895
/// Decides whether a progress event is due: on every 500th item and when
/// `done` reaches `total`.
fn should_emit_progress(done: usize, total: usize) -> bool {
    const TICK_INTERVAL: usize = 500;
    let on_tick = done % TICK_INTERVAL == 0;
    let finished = done == total;
    finished || on_tick
}
899
900fn compile_globs(patterns: &[String]) -> Result<GlobSet> {
901 let mut builder = GlobSetBuilder::new();
902 for pattern in patterns {
903 builder.add(Glob::new(pattern).map_err(|err| OkError::Config(err.to_string()))?);
904 }
905 builder
906 .build()
907 .map_err(|err| OkError::Config(err.to_string()))
908}
909
910fn hash_bytes(bytes: &[u8]) -> String {
911 let mut hasher = Sha256::new();
912 hasher.update(bytes);
913 format!("{:x}", hasher.finalize())
914}
915
916fn stable_id(value: &str) -> String {
917 hash_bytes(value.as_bytes())
918}
919
920fn dedupe_symbols(symbols: &mut Vec<Symbol>) {
921 let mut seen = HashSet::new();
922 symbols.retain(|symbol| seen.insert(symbol.id.clone()));
923}
924
925fn derive_occurrences(_chunks: &[CodeChunk], symbols: &[Symbol]) -> Vec<SymbolOccurrence> {
926 let mut occurrences = symbols
927 .iter()
928 .map(|symbol| SymbolOccurrence {
929 symbol_id: symbol.id.clone(),
930 file_id: symbol.file_id.clone(),
931 range: symbol.range.clone(),
932 is_definition: true,
933 confidence: symbol.confidence,
934 provenance: symbol.provenance.clone(),
935 })
936 .collect::<Vec<_>>();
937 occurrences.sort_by(|a, b| {
938 (
939 &a.symbol_id.0,
940 &a.file_id.0,
941 a.range.as_ref().map(|r| r.start),
942 a.is_definition,
943 )
944 .cmp(&(
945 &b.symbol_id.0,
946 &b.file_id.0,
947 b.range.as_ref().map(|r| r.start),
948 b.is_definition,
949 ))
950 });
951 occurrences.dedup_by(|a, b| {
952 a.symbol_id == b.symbol_id
953 && a.file_id == b.file_id
954 && a.range == b.range
955 && a.is_definition == b.is_definition
956 });
957 occurrences
958}
959
#[cfg(test)]
mod tests {
    use super::{derive_occurrences, Indexer};
    use open_kioku_config::OkConfig;
    use open_kioku_core::{
        CodeChunk, Confidence, EvidenceSourceType, FileId, Language, LineRange, Symbol, SymbolId,
        SymbolKind,
    };
    use std::process::Command;

    /// Builds a function symbol named `name` on a single `line`, in a file
    /// whose id is derived from `id`.
    fn symbol(id: &str, name: &str, line: u32) -> Symbol {
        Symbol {
            id: SymbolId::new(id),
            name: name.into(),
            qualified_name: format!("src::index::{name}"),
            kind: SymbolKind::Function,
            file_id: FileId::new(format!("file-{id}")),
            range: Some(LineRange::single(line)),
            language: Language::TypeScript,
            confidence: Confidence::High,
            provenance: EvidenceSourceType::TreeSitter,
        }
    }

    /// Chunk text that mentions a symbol must not produce a reference
    /// occurrence: only one definition per symbol is expected.
    #[test]
    fn derive_occurrences_records_definitions_only_for_heuristic_indexing() {
        let symbols = vec![symbol("retry", "retry", 1), symbol("render", "render", 2)];
        let chunks = vec![CodeChunk {
            id: "chunk".into(),
            file_id: FileId::new("file-chunk"),
            range: LineRange { start: 10, end: 12 },
            language: Language::TypeScript,
            text: "retry(); const retried = true;".into(),
            symbol_id: None,
        }];

        let occurrences = derive_occurrences(&chunks, &symbols);
        let definitions = occurrences
            .iter()
            .filter(|occurrence| occurrence.is_definition)
            .count();
        let references = occurrences
            .iter()
            .filter(|occurrence| !occurrence.is_definition)
            .count();

        assert_eq!(definitions, 2);
        assert_eq!(references, 0);
    }

    /// End-to-end run over a temporary repository that contains a Gradle
    /// marker, one Java test source, coverage and JUnit report files, and two
    /// runtime artifacts; checks the quality counters and the runtime facts.
    #[test]
    fn index_manifest_records_build_and_analysis_provider_signals() {
        let temp = tempfile::tempdir().unwrap();
        let root = temp.path();
        std::fs::write(root.join("settings.gradle"), "").unwrap();
        std::fs::create_dir_all(root.join("src/test/java/org/example")).unwrap();
        std::fs::write(
            root.join("src/test/java/org/example/ExampleTests.java"),
            r#"package org.example;
import org.springframework.web.bind.annotation.GetMapping;
class ExampleTests extends BaseTests {
    @GetMapping("/example")
    void works() {
        System.getenv("EXAMPLE_REGION");
    }
}
"#,
        )
        .unwrap();
        // Report files picked up by the coverage and JUnit counters.
        std::fs::create_dir_all(root.join(".ok/analysis")).unwrap();
        std::fs::write(root.join(".ok/analysis/jacoco.xml"), "<report/>").unwrap();
        std::fs::write(
            root.join(".ok/analysis/TEST-org.example.ExampleTests.xml"),
            "<testsuite/>",
        )
        .unwrap();
        // One span record (route plus SQL statement) and one incident record,
        // both pointing at the Java source above.
        std::fs::create_dir_all(root.join(".ok/runtime")).unwrap();
        std::fs::write(
            root.join(".ok/runtime/spans.jsonl"),
            r#"{"file":"src/test/java/org/example/ExampleTests.java","line":4,"attributes":{"http.route":"/example","http.request.method":"GET","db.statement":"select * from example_orders"}}"#,
        )
        .unwrap();
        std::fs::write(
            root.join(".ok/runtime/incidents.jsonl"),
            r#"{"file":"src/test/java/org/example/ExampleTests.java","line":5,"error.message":"checkout failure after runtime request"}"#,
        )
        .unwrap();

        let mut config = OkConfig::default();
        config.scip.enabled = false;
        let snapshot = Indexer::default().index_repo(root, &config).unwrap();

        assert!(snapshot
            .manifest
            .quality
            .build_systems
            .contains(&"gradle".to_string()));
        assert_eq!(snapshot.manifest.quality.coverage_reports, 1);
        assert_eq!(snapshot.manifest.quality.junit_reports, 1);
        assert!(snapshot.manifest.quality.static_analysis_facts >= 3);
        // Endpoint and table facts from the span, incident fact from the incident.
        assert_eq!(snapshot.manifest.quality.runtime_analysis_facts, 3);
        assert!(snapshot
            .analysis_facts
            .iter()
            .any(|fact| fact.target == "GET /example"));
        assert!(snapshot
            .analysis_facts
            .iter()
            .any(|fact| fact.target == "example_orders"));
        assert!(snapshot
            .analysis_facts
            .iter()
            .any(|fact| fact.target == "checkout failure after runtime request"));
        assert!(snapshot
            .manifest
            .quality
            .semantic_provider_notes
            .iter()
            .any(|note| note.contains("build systems detected")));
    }

    /// Indexes a one-commit git repository twice: with history enabled the
    /// co-change facts must be present, with history disabled there must be none.
    #[test]
    fn index_git_history_facts_can_be_disabled() {
        let temp = tempfile::tempdir().unwrap();
        let root = temp.path();
        git(root, &["init"]);
        git(root, &["config", "user.email", "test@example.com"]);
        git(root, &["config", "user.name", "Test User"]);
        std::fs::create_dir_all(root.join("src")).unwrap();
        std::fs::create_dir_all(root.join("tests")).unwrap();
        std::fs::write(root.join("src/auth.rs"), "pub fn login() {}\n").unwrap();
        std::fs::write(
            root.join("tests/auth_test.rs"),
            "#[test] fn login_test() {}\n",
        )
        .unwrap();
        git(root, &["add", "."]);
        git(root, &["commit", "-m", "auth with tests"]);

        let mut enabled = OkConfig::default();
        enabled.scip.enabled = false;
        let snapshot = Indexer::default().index_repo(root, &enabled).unwrap();
        assert!(snapshot.manifest.quality.git_history_facts > 0);
        assert!(snapshot
            .analysis_facts
            .iter()
            .any(|fact| fact.source_type == EvidenceSourceType::GitHistory
                && fact.target == "tests/auth_test.rs"));

        // Same configuration, with only the history switch turned off.
        let mut disabled = enabled;
        disabled.history.enabled = false;
        let snapshot = Indexer::default().index_repo(root, &disabled).unwrap();
        assert_eq!(snapshot.manifest.quality.git_history_facts, 0);
        assert!(!snapshot
            .analysis_facts
            .iter()
            .any(|fact| fact.source_type == EvidenceSourceType::GitHistory));
    }

    /// Runs `git -C <root> <args>` and panics when the command fails.
    fn git(root: &std::path::Path, args: &[&str]) {
        let status = Command::new("git")
            .arg("-C")
            .arg(root)
            .args(args)
            .status()
            .unwrap();
        assert!(status.success(), "git {args:?} failed");
    }
}