// open_kioku_ingest/lib.rs

1use chrono::Utc;
2use globset::{Glob, GlobSet, GlobSetBuilder};
3use ignore::WalkBuilder;
4use open_kioku_config::OkConfig;
5use open_kioku_core::{
6    AnalysisFact, CodeChunk, Confidence, EvidenceSourceType, File, FileId, GraphEdgeType,
7    GraphNodeType, Import, IndexManifest, IndexQuality, LineRange, Repository, RepositoryId,
8    Symbol, SymbolOccurrence, TestTarget,
9};
10use open_kioku_errors::{OkError, Result};
11use open_kioku_languages::{
12    detect_language, is_supported_code, likely_generated, likely_vendor_path,
13};
14use open_kioku_parse::{HeuristicParser, Parser};
15use open_kioku_scip::ScipIndexReport;
16use rayon::prelude::*;
17use serde_json::Value;
18use sha2::{Digest, Sha256};
19use std::collections::{HashMap, HashSet};
20use std::fs;
21use std::path::Path;
22use std::sync::atomic::{AtomicUsize, Ordering};
23
24#[derive(Debug, Clone)]
25pub struct IndexSnapshot {
26    pub manifest: IndexManifest,
27    pub files: Vec<File>,
28    pub symbols: Vec<Symbol>,
29    pub chunks: Vec<CodeChunk>,
30    pub tests: Vec<TestTarget>,
31    pub imports: Vec<Import>,
32    pub occurrences: Vec<SymbolOccurrence>,
33    pub analysis_facts: Vec<AnalysisFact>,
34    pub scip: Option<ScipIndexReport>,
35}
36
/// A progress event handed to the callback of `Indexer::index_repo_with_progress`.
#[derive(Debug, Clone)]
pub struct IndexProgress {
    /// Name of the phase being reported. The indexer in this file emits
    /// `"scan"`, `"parse"`, `"extract"`, `"analysis"`, `"occurrences"` and
    /// `"scip"`.
    pub phase: &'static str,
    /// Number of regular files visited so far.
    pub scanned_files: usize,
    /// Number of files accepted (scan phase) or parsed (later phases) so far.
    pub indexed_files: usize,
    /// Total number of files to process; `None` while the scan is still
    /// walking the tree and the total is unknown.
    pub total_files: Option<usize>,
}
44
45pub struct Indexer {
46    parser: Box<dyn Parser>,
47}
48
49impl Default for Indexer {
50    fn default() -> Self {
51        Self {
52            parser: Box::<HeuristicParser>::default(),
53        }
54    }
55}
56
57impl Indexer {
58    pub fn index_repo(&self, root: impl AsRef<Path>, config: &OkConfig) -> Result<IndexSnapshot> {
59        self.index_repo_with_progress(root, config, |_| {})
60    }
61
62    pub fn index_repo_with_progress<F>(
63        &self,
64        root: impl AsRef<Path>,
65        config: &OkConfig,
66        on_progress: F,
67    ) -> Result<IndexSnapshot>
68    where
69        F: Fn(IndexProgress) + Sync,
70    {
71        let root = root.as_ref().canonicalize()?;
72        let repo_id = RepositoryId::new(stable_id(root.to_string_lossy().as_ref()));
73        let build_hint: Option<String> =
74            if root.join("build.gradle").exists() || root.join("build.gradle.kts").exists() {
75                Some("gradle".to_string())
76            } else if root.join("pom.xml").exists() {
77                Some("maven".to_string())
78            } else if root.join("WORKSPACE").exists()
79                || root.join("BUILD.bazel").exists()
80                || root.join("BUILD").exists()
81            {
82                Some("bazel".to_string())
83            } else {
84                None
85            };
86        let files = self.scan_files(&root, config, &repo_id, &on_progress)?;
87        on_progress(IndexProgress {
88            phase: "parse",
89            scanned_files: files.len(),
90            indexed_files: 0,
91            total_files: Some(files.len()),
92        });
93        let parsed_count = AtomicUsize::new(0);
94        let parsed = files
95            .par_iter()
96            .map(|file| -> Result<_> {
97                let bytes = fs::read(root.join(&file.path))?;
98                let content = String::from_utf8_lossy(&bytes).into_owned();
99                let parsed = self
100                    .parser
101                    .parse_with_hint(file, &content, build_hint.as_deref());
102                let indexed_files = parsed_count.fetch_add(1, Ordering::Relaxed) + 1;
103                if should_emit_progress(indexed_files, files.len()) {
104                    on_progress(IndexProgress {
105                        phase: "parse",
106                        scanned_files: files.len(),
107                        indexed_files,
108                        total_files: Some(files.len()),
109                    });
110                }
111                Ok(parsed)
112            })
113            .collect::<Result<Vec<_>>>()?;
114        on_progress(IndexProgress {
115            phase: "extract",
116            scanned_files: files.len(),
117            indexed_files: files.len(),
118            total_files: Some(files.len()),
119        });
120
121        let mut symbols = parsed
122            .iter()
123            .flat_map(|file| file.symbols.clone())
124            .collect::<Vec<_>>();
125        dedupe_symbols(&mut symbols);
126        let chunks = parsed
127            .iter()
128            .flat_map(|file| file.chunks.clone())
129            .collect::<Vec<_>>();
130        let tests = parsed
131            .iter()
132            .flat_map(|file| file.tests.clone())
133            .collect::<Vec<_>>();
134        let imports = parsed
135            .iter()
136            .flat_map(|file| file.imports.clone())
137            .collect::<Vec<_>>();
138        let mut analysis_facts = parsed
139            .iter()
140            .flat_map(|file| file.analysis_facts.clone())
141            .collect::<Vec<_>>();
142        let static_analysis_facts = analysis_facts.len();
143        on_progress(IndexProgress {
144            phase: "analysis",
145            scanned_files: files.len(),
146            indexed_files: files.len(),
147            total_files: Some(files.len()),
148        });
149        let runtime_facts = collect_runtime_analysis_facts(&root, &files)?;
150        let runtime_analysis_facts = runtime_facts.len();
151        analysis_facts.extend(runtime_facts);
152        let git_history_facts = if config.history.enabled {
153            collect_git_history_facts(
154                &root,
155                &files,
156                config.history.max_commits,
157                config.history.max_files_per_commit,
158            )?
159        } else {
160            Vec::new()
161        };
162        let git_history_fact_count = git_history_facts.len();
163        analysis_facts.extend(git_history_facts);
164        on_progress(IndexProgress {
165            phase: "occurrences",
166            scanned_files: files.len(),
167            indexed_files: files.len(),
168            total_files: Some(files.len()),
169        });
170        let mut occurrences = derive_occurrences(&chunks, &symbols);
171        let mut scip_report = None;
172        if config.scip.enabled {
173            on_progress(IndexProgress {
174                phase: "scip",
175                scanned_files: files.len(),
176                indexed_files: files.len(),
177                total_files: Some(files.len()),
178            });
179            let (imported, report) =
180                open_kioku_scip::prepare_and_import_scip(&root, &config.scip, &repo_id)?;
181            symbols.extend(imported.symbols);
182            dedupe_symbols(&mut symbols);
183            occurrences.extend(imported.occurrences);
184            scip_report = Some(report);
185        }
186        let repository = Repository {
187            id: repo_id,
188            name: config.repo.name.clone(),
189            root: root.clone(),
190            branch: open_kioku_git::branch(&root),
191            commit: open_kioku_git::commit(&root),
192            indexed_at: Some(Utc::now()),
193        };
194        let quality = index_quality(
195            &root,
196            config,
197            scip_report.as_ref(),
198            tests.len(),
199            imports.len(),
200            AnalysisCounts {
201                static_facts: static_analysis_facts,
202                runtime_facts: runtime_analysis_facts,
203                git_history_facts: git_history_fact_count,
204            },
205        );
206        let manifest = IndexManifest {
207            repository,
208            file_count: files.len(),
209            symbol_count: symbols.len(),
210            chunk_count: chunks.len(),
211            indexed_at: Utc::now(),
212            schema_version: 1,
213            quality,
214        };
215        Ok(IndexSnapshot {
216            manifest,
217            files,
218            symbols,
219            chunks,
220            tests,
221            imports,
222            occurrences,
223            analysis_facts,
224            scip: scip_report,
225        })
226    }
227
228    fn scan_files(
229        &self,
230        root: &Path,
231        config: &OkConfig,
232        repository_id: &RepositoryId,
233        on_progress: &dyn Fn(IndexProgress),
234    ) -> Result<Vec<File>> {
235        let max_size = config.max_file_size_bytes()?;
236        let excludes = compile_globs(&config.index.exclude)?;
237        let denied = compile_globs(&config.paths.deny)?;
238        let mut builder = WalkBuilder::new(root);
239        builder.hidden(!config.security.allow_hidden_files);
240        builder.git_ignore(true).git_exclude(true).parents(true);
241        let mut files = Vec::new();
242        let mut scanned_files = 0;
243        on_progress(IndexProgress {
244            phase: "scan",
245            scanned_files,
246            indexed_files: files.len(),
247            total_files: None,
248        });
249        for entry in builder.build() {
250            let entry = entry.map_err(|err| OkError::Index(err.to_string()))?;
251            if !entry
252                .file_type()
253                .map(|kind| kind.is_file())
254                .unwrap_or(false)
255            {
256                continue;
257            }
258            scanned_files += 1;
259            let path = entry.path();
260            let rel = path.strip_prefix(root).unwrap_or(path).to_path_buf();
261            if excludes.is_match(&rel) || denied.is_match(&rel) {
262                if should_emit_progress(scanned_files, 0) {
263                    on_progress(IndexProgress {
264                        phase: "scan",
265                        scanned_files,
266                        indexed_files: files.len(),
267                        total_files: None,
268                    });
269                }
270                continue;
271            }
272            let metadata = entry
273                .metadata()
274                .map_err(|err| OkError::Index(err.to_string()))?;
275            if metadata.len() > max_size {
276                continue;
277            }
278            let language = detect_language(&rel);
279            if !is_supported_code(&language) {
280                continue;
281            }
282            let bytes = fs::read(path)?;
283            if bytes.contains(&0) {
284                continue;
285            }
286            let content = String::from_utf8_lossy(&bytes);
287            let content_hash = hash_bytes(&bytes);
288            files.push(File {
289                id: FileId::new(stable_id(&rel.to_string_lossy())),
290                repository_id: repository_id.clone(),
291                path: rel.clone(),
292                language,
293                size_bytes: metadata.len(),
294                content_hash,
295                is_generated: likely_generated(&content),
296                is_vendor: likely_vendor_path(&rel),
297            });
298            if should_emit_progress(scanned_files, 0) {
299                on_progress(IndexProgress {
300                    phase: "scan",
301                    scanned_files,
302                    indexed_files: files.len(),
303                    total_files: None,
304                });
305            }
306        }
307        on_progress(IndexProgress {
308            phase: "scan",
309            scanned_files,
310            indexed_files: files.len(),
311            total_files: Some(files.len()),
312        });
313        Ok(files)
314    }
315}
316
/// Number of analysis facts per evidence source, passed to `index_quality`.
#[derive(Debug, Clone, Copy)]
struct AnalysisCounts {
    /// Facts reported by the parser.
    static_facts: usize,
    /// Facts read from local runtime artifacts.
    runtime_facts: usize,
    /// Facts derived from git co-change history.
    git_history_facts: usize,
}
323
324fn index_quality(
325    root: &Path,
326    config: &OkConfig,
327    scip_report: Option<&ScipIndexReport>,
328    test_count: usize,
329    import_count: usize,
330    analysis: AnalysisCounts,
331) -> IndexQuality {
332    let mut quality_notes = Vec::new();
333    let build_systems = detect_build_systems(root);
334    let codeql_databases = detect_codeql_databases(root);
335    let coverage_reports = count_analysis_artifacts(root, &["jacoco.xml", "coverage.xml"]);
336    let junit_reports = count_analysis_artifacts(root, &["test-", "junit"]);
337    let mut semantic_provider_notes = Vec::new();
338    if !build_systems.is_empty() {
339        semantic_provider_notes.push(format!(
340            "build systems detected: {}",
341            build_systems.join(", ")
342        ));
343    }
344    if codeql_databases > 0 {
345        semantic_provider_notes.push(format!(
346            "CodeQL database artifacts detected: {codeql_databases}"
347        ));
348    }
349    if coverage_reports > 0 {
350        semantic_provider_notes.push(format!("coverage reports detected: {coverage_reports}"));
351    }
352    if junit_reports > 0 {
353        semantic_provider_notes.push(format!("JUnit-style reports detected: {junit_reports}"));
354    }
355    if analysis.static_facts > 0 {
356        semantic_provider_notes.push(format!(
357            "language static analysis facts detected: {}",
358            analysis.static_facts
359        ));
360    }
361    if analysis.runtime_facts > 0 {
362        semantic_provider_notes.push(format!(
363            "runtime analysis facts detected: {}",
364            analysis.runtime_facts
365        ));
366    }
367    if analysis.git_history_facts > 0 {
368        semantic_provider_notes.push(format!(
369            "git history co-change facts detected: {}",
370            analysis.git_history_facts
371        ));
372    }
373    let scip_mode = format!("{:?}", config.scip.mode).to_ascii_lowercase();
374    if let Some(report) = scip_report {
375        if report.imported_paths.is_empty() {
376            quality_notes.push("SCIP was enabled but no SCIP index was imported".into());
377        }
378        if report.exact_references == 0 {
379            quality_notes.push(
380                "exact reference coverage is unavailable; impact and test selection are heuristic"
381                    .into(),
382            );
383        }
384        for attempt in &report.generator_attempts {
385            if !matches!(
386                attempt.status,
387                open_kioku_scip::ScipGeneratorStatus::Generated
388                    | open_kioku_scip::ScipGeneratorStatus::Skipped
389            ) {
390                quality_notes.push(format!(
391                    "SCIP {} generation {:?}: {}",
392                    attempt.language, attempt.status, attempt.message
393                ));
394            }
395        }
396        IndexQuality {
397            scip_enabled: config.scip.enabled,
398            scip_mode,
399            scip_indexes_imported: report.imported_paths.len(),
400            scip_symbols: report.symbols,
401            scip_occurrences: report.occurrences,
402            scip_exact_references: report.exact_references,
403            test_count,
404            import_count,
405            build_systems,
406            codeql_databases,
407            coverage_reports,
408            junit_reports,
409            static_analysis_facts: analysis.static_facts,
410            runtime_analysis_facts: analysis.runtime_facts,
411            git_history_facts: analysis.git_history_facts,
412            semantic_provider_notes,
413            quality_notes,
414        }
415    } else {
416        if !config.scip.enabled {
417            quality_notes
418                .push("SCIP disabled; symbol references use tree-sitter/import heuristics".into());
419        }
420        IndexQuality {
421            scip_enabled: config.scip.enabled,
422            scip_mode,
423            scip_indexes_imported: 0,
424            scip_symbols: 0,
425            scip_occurrences: 0,
426            scip_exact_references: 0,
427            test_count,
428            import_count,
429            build_systems,
430            codeql_databases,
431            coverage_reports,
432            junit_reports,
433            static_analysis_facts: analysis.static_facts,
434            runtime_analysis_facts: analysis.runtime_facts,
435            git_history_facts: analysis.git_history_facts,
436            semantic_provider_notes,
437            quality_notes,
438        }
439    }
440}
441
442fn collect_git_history_facts(
443    root: &Path,
444    files: &[File],
445    max_commits: usize,
446    max_files_per_commit: usize,
447) -> Result<Vec<AnalysisFact>> {
448    let files_by_path = files
449        .iter()
450        .map(|file| (normalize_history_path(&file.path), file))
451        .collect::<HashMap<_, _>>();
452    let records = open_kioku_git::cochange_records(root, max_commits, max_files_per_commit)?;
453    let mut facts = Vec::new();
454    for record in records {
455        let Some(file) = files_by_path.get(&normalize_history_path(&record.path)) else {
456            continue;
457        };
458        if !files_by_path.contains_key(&normalize_history_path(&record.cochanged_path)) {
459            continue;
460        }
461        let id = stable_id(&format!(
462            "git-history:{}:{}",
463            record.path.display(),
464            record.cochanged_path.display()
465        ));
466        let mut message = format!(
467            "git co-change observed in {} commit(s), recency weight {:.2}",
468            record.commit_count, record.recency_weight
469        );
470        if record.test_corun {
471            message.push_str("; includes historical path-to-test co-run");
472        }
473        facts.push(AnalysisFact {
474            id,
475            file_id: file.id.clone(),
476            symbol_id: None,
477            target: normalize_history_path(&record.cochanged_path),
478            target_kind: if record.test_corun {
479                GraphNodeType::Test
480            } else {
481                GraphNodeType::File
482            },
483            edge_type: GraphEdgeType::ChangedBy,
484            range: None,
485            confidence: Confidence::from_score((0.45 + record.recency_weight / 4.0).min(0.90)),
486            source: format!("git-history:{}", record.commits.join(",")),
487            source_type: EvidenceSourceType::GitHistory,
488            message,
489        });
490        if facts.len() >= 5000 {
491            break;
492        }
493    }
494    Ok(dedupe_analysis_facts(facts))
495}
496
/// Lists the build systems whose marker files exist directly under `root`.
///
/// The result follows a fixed order: gradle, maven, bazel, cargo, npm, go.
fn detect_build_systems(root: &Path) -> Vec<String> {
    const MARKERS: [(&str, &[&str]); 6] = [
        (
            "gradle",
            &[
                "settings.gradle",
                "settings.gradle.kts",
                "build.gradle",
                "build.gradle.kts",
            ],
        ),
        ("maven", &["pom.xml"]),
        ("bazel", &["WORKSPACE", "WORKSPACE.bazel", "MODULE.bazel"]),
        ("cargo", &["Cargo.toml"]),
        ("npm", &["package.json"]),
        ("go", &["go.mod"]),
    ];

    MARKERS
        .iter()
        .filter(|(_, marker_files)| {
            marker_files
                .iter()
                .any(|marker| root.join(marker).exists())
        })
        .map(|(system, _)| system.to_string())
        .collect()
}
524
/// Counts the well-known CodeQL database locations under `root` that look
/// populated.
///
/// A location counts when it is a directory containing at least one of
/// `db-java`, `codeql-database.yml` or `log`.
fn detect_codeql_databases(root: &Path) -> usize {
    const LOCATIONS: [&str; 4] = [
        ".ok/codeql",
        "codeql-db",
        "codeql-database",
        ".codeql/database",
    ];
    const CONTENT_MARKERS: [&str; 3] = ["db-java", "codeql-database.yml", "log"];

    let mut found = 0;
    for location in LOCATIONS.iter() {
        let dir = root.join(location);
        let populated = dir.is_dir()
            && CONTENT_MARKERS
                .iter()
                .any(|marker| dir.join(marker).exists());
        if populated {
            found += 1;
        }
    }
    found
}
542
543fn count_analysis_artifacts(root: &Path, names: &[&str]) -> usize {
544    let candidates = [
545        root.join(".ok/analysis"),
546        root.join("build/reports"),
547        root.join("target/site"),
548        root.join("coverage"),
549    ];
550    let mut count = 0;
551    for candidate in candidates {
552        if !candidate.is_dir() {
553            continue;
554        }
555        for entry in walkdir::WalkDir::new(candidate)
556            .max_depth(5)
557            .into_iter()
558            .filter_map(|entry| entry.ok())
559        {
560            if !entry.file_type().is_file() {
561                continue;
562            }
563            let file_name = entry.file_name().to_string_lossy().to_ascii_lowercase();
564            if names.iter().any(|needle| file_name.contains(needle)) {
565                count += 1;
566            }
567        }
568    }
569    count
570}
571
572fn collect_runtime_analysis_facts(root: &Path, files: &[File]) -> Result<Vec<AnalysisFact>> {
573    let files_by_path = files
574        .iter()
575        .map(|file| (normalize_path(&file.path.to_string_lossy()), file))
576        .collect::<HashMap<_, _>>();
577    let mut facts = Vec::new();
578    for runtime_root in [
579        root.join(".ok/runtime"),
580        root.join(".ok/analysis/runtime"),
581        root.join(".ok/analysis"),
582    ] {
583        if !runtime_root.is_dir() {
584            continue;
585        }
586        for entry in walkdir::WalkDir::new(&runtime_root)
587            .max_depth(3)
588            .into_iter()
589            .filter_map(|entry| entry.ok())
590        {
591            if !entry.file_type().is_file() {
592                continue;
593            }
594            let path = entry.path();
595            let Some(file_name) = path.file_name().and_then(|value| value.to_str()) else {
596                continue;
597            };
598            let lower_name = file_name.to_ascii_lowercase();
599            if !lower_name.ends_with(".jsonl")
600                || !(lower_name.contains("span")
601                    || lower_name.contains("trace")
602                    || lower_name.contains("runtime")
603                    || lower_name.contains("otel")
604                    || lower_name.contains("log")
605                    || lower_name.contains("incident")
606                    || lower_name.contains("error")
607                    || lower_name.contains("failure"))
608            {
609                continue;
610            }
611            let metadata = entry
612                .metadata()
613                .map_err(|err| OkError::Index(err.to_string()))?;
614            if metadata.len() > 5 * 1024 * 1024 {
615                continue;
616            }
617            let content = fs::read_to_string(path)?;
618            for (idx, line) in content.lines().enumerate() {
619                if facts.len() >= 10_000 {
620                    return Ok(dedupe_analysis_facts(facts));
621                }
622                let trimmed = line.trim();
623                if trimmed.is_empty() {
624                    continue;
625                }
626                let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
627                    continue;
628                };
629                let Some(source_file) =
630                    json_string(&value, &["file", "code.filepath", "source.file"])
631                else {
632                    continue;
633                };
634                let normalized = normalize_runtime_file(root, &source_file);
635                let Some(file) = files_by_path.get(&normalized).copied() else {
636                    continue;
637                };
638                let line_number = json_u32(&value, &["line", "code.lineno", "source.line"]);
639                if let Some(fact) = runtime_endpoint_fact(file, &value, line_number, path, idx + 1)
640                {
641                    facts.push(fact);
642                }
643                if let Some(fact) = runtime_table_fact(file, &value, line_number, path, idx + 1) {
644                    facts.push(fact);
645                }
646                if let Some(fact) = runtime_incident_fact(file, &value, line_number, path, idx + 1)
647                {
648                    facts.push(fact);
649                }
650            }
651        }
652    }
653    Ok(dedupe_analysis_facts(facts))
654}
655
656fn runtime_endpoint_fact(
657    file: &File,
658    value: &Value,
659    line_number: Option<u32>,
660    artifact: &Path,
661    artifact_line: usize,
662) -> Option<AnalysisFact> {
663    let route = json_string(
664        value,
665        &[
666            "http.route",
667            "http.target",
668            "url.path",
669            "route",
670            "name",
671            "span.name",
672        ],
673    )?;
674    if !route.contains('/') {
675        return None;
676    }
677    let method = json_string(
678        value,
679        &[
680            "http.request.method",
681            "http.method",
682            "method",
683            "request.method",
684        ],
685    )
686    .unwrap_or_else(|| "HTTP".into())
687    .to_ascii_uppercase();
688    Some(runtime_fact(
689        file,
690        GraphEdgeType::ExposesEndpoint,
691        GraphNodeType::Endpoint,
692        format!("{method} {route}"),
693        line_number,
694        RuntimeFactSource {
695            artifact,
696            artifact_line,
697            message: "runtime endpoint observed in local trace artifact",
698        },
699    ))
700}
701
702fn runtime_table_fact(
703    file: &File,
704    value: &Value,
705    line_number: Option<u32>,
706    artifact: &Path,
707    artifact_line: usize,
708) -> Option<AnalysisFact> {
709    let statement = json_string(value, &["db.statement", "sql", "database.statement"])?;
710    let table = extract_sql_table(&statement)?;
711    Some(runtime_fact(
712        file,
713        GraphEdgeType::ReadsTable,
714        GraphNodeType::DatabaseTable,
715        table,
716        line_number,
717        RuntimeFactSource {
718            artifact,
719            artifact_line,
720            message: "runtime database access observed in local trace artifact",
721        },
722    ))
723}
724
725fn runtime_incident_fact(
726    file: &File,
727    value: &Value,
728    line_number: Option<u32>,
729    artifact: &Path,
730    artifact_line: usize,
731) -> Option<AnalysisFact> {
732    let message = json_string(
733        value,
734        &[
735            "error.message",
736            "exception.message",
737            "log.message",
738            "message",
739            "event.message",
740            "span.status.message",
741            "name",
742            "span.name",
743        ],
744    )?;
745    let signal = compact_runtime_message(&message)?;
746    Some(runtime_fact(
747        file,
748        GraphEdgeType::FailedIn,
749        GraphNodeType::RuntimeError,
750        signal,
751        line_number,
752        RuntimeFactSource {
753            artifact,
754            artifact_line,
755            message: "runtime incident observed in local log or failure artifact",
756        },
757    ))
758}
759
/// Where a runtime fact came from, bundled to keep `runtime_fact`'s argument
/// list short.
struct RuntimeFactSource<'a> {
    /// Artifact file the record was read from.
    artifact: &'a Path,
    /// One-based line of the record inside the artifact.
    artifact_line: usize,
    /// Fixed human-readable description stored as the fact's message.
    message: &'static str,
}
765
766fn runtime_fact(
767    file: &File,
768    edge_type: GraphEdgeType,
769    target_kind: GraphNodeType,
770    target: String,
771    line_number: Option<u32>,
772    source: RuntimeFactSource<'_>,
773) -> AnalysisFact {
774    AnalysisFact {
775        id: stable_id(&format!(
776            "runtime:{}:{:?}:{}:{}",
777            file.path.display(),
778            edge_type,
779            target,
780            source.artifact_line
781        )),
782        file_id: file.id.clone(),
783        symbol_id: None,
784        target,
785        target_kind,
786        edge_type,
787        range: line_number.map(LineRange::single),
788        confidence: Confidence::High,
789        source: format!("open-kioku-runtime:{}", source.artifact.display()),
790        source_type: EvidenceSourceType::Runtime,
791        message: source.message.into(),
792    }
793}
794
795fn json_string(value: &Value, keys: &[&str]) -> Option<String> {
796    for key in keys {
797        if let Some(value) = nested_json_value(value, key).and_then(Value::as_str) {
798            return Some(value.to_string());
799        }
800        if let Some(value) = value
801            .get("attributes")
802            .and_then(|attributes| nested_json_value(attributes, key))
803            .and_then(Value::as_str)
804        {
805            return Some(value.to_string());
806        }
807        if let Some(value) = value
808            .get("resource")
809            .and_then(|resource| resource.get("attributes"))
810            .and_then(|attributes| nested_json_value(attributes, key))
811            .and_then(Value::as_str)
812        {
813            return Some(value.to_string());
814        }
815    }
816    None
817}
818
819fn json_u32(value: &Value, keys: &[&str]) -> Option<u32> {
820    for key in keys {
821        if let Some(value) = nested_json_value(value, key)
822            .and_then(Value::as_u64)
823            .and_then(|value| u32::try_from(value).ok())
824        {
825            return Some(value);
826        }
827        if let Some(value) = value
828            .get("attributes")
829            .and_then(|attributes| nested_json_value(attributes, key))
830            .and_then(Value::as_u64)
831            .and_then(|value| u32::try_from(value).ok())
832        {
833            return Some(value);
834        }
835    }
836    None
837}
838
839fn nested_json_value<'a>(value: &'a Value, key: &str) -> Option<&'a Value> {
840    if let Some(exact) = value.get(key) {
841        return Some(exact);
842    }
843    let mut current = value;
844    for segment in key.split('.') {
845        current = current.get(segment)?;
846    }
847    Some(current)
848}
849
850fn normalize_runtime_file(root: &Path, value: &str) -> String {
851    let path = Path::new(value);
852    let rel = if path.is_absolute() {
853        path.strip_prefix(root).unwrap_or(path)
854    } else {
855        path
856    };
857    normalize_path(&rel.to_string_lossy())
858}
859
/// Normalizes a path string for use as a lookup key: backslashes become
/// forward slashes and any leading `./` segments are removed.
///
/// Separators are converted before the prefix is stripped, so a Windows-style
/// `.\src\lib.rs` normalizes to `src/lib.rs` just like `./src/lib.rs` does.
fn normalize_path(value: &str) -> String {
    let unified = value.replace('\\', "/");
    unified.trim_start_matches("./").to_string()
}
863
864fn normalize_history_path(path: &Path) -> String {
865    normalize_path(&path.to_string_lossy())
866}
867
/// Extracts the table name following the first `FROM`, `JOIN`, `UPDATE` or
/// `INTO` keyword of a SQL statement.
///
/// Keywords are tried in that priority order, case-insensitively. A keyword
/// only counts as a whole word: it must start the statement or follow ASCII
/// whitespace, and must itself be followed by ASCII whitespace. This covers
/// statements that begin with `UPDATE` and keywords separated by newlines or
/// tabs. The table name is the first run of ASCII alphanumerics, `_` and `.`
/// after the keyword.
///
/// Returns `None` when no keyword is present or nothing identifier-like
/// follows the matched keyword.
fn extract_sql_table(statement: &str) -> Option<String> {
    // ASCII lowercasing keeps byte offsets aligned with `statement`.
    let lower = statement.to_ascii_lowercase();
    let bytes = lower.as_bytes();
    for keyword in ["from", "join", "update", "into"] {
        let mut search_from = 0;
        while let Some(offset) = lower[search_from..].find(keyword) {
            let begin = search_from + offset;
            let end = begin + keyword.len();
            search_from = end;
            let starts_word = begin == 0 || bytes[begin - 1].is_ascii_whitespace();
            let ends_word = bytes
                .get(end)
                .map_or(false, |byte| byte.is_ascii_whitespace());
            if !(starts_word && ends_word) {
                // Part of a longer word such as "fromage"; keep looking.
                continue;
            }
            let table = statement[end..]
                .split(|ch: char| !ch.is_ascii_alphanumeric() && ch != '_' && ch != '.')
                .find(|part| !part.is_empty())?;
            return Some(table.to_string());
        }
    }
    None
}
881
/// Trims `message` and keeps at most its first 160 characters.
///
/// Returns `None` when the message is empty or whitespace only.
fn compact_runtime_message(message: &str) -> Option<String> {
    const MAX_CHARS: usize = 160;
    match message.trim() {
        "" => None,
        trimmed => Some(trimmed.chars().take(MAX_CHARS).collect()),
    }
}
889
890fn dedupe_analysis_facts(mut facts: Vec<AnalysisFact>) -> Vec<AnalysisFact> {
891    let mut seen = HashSet::new();
892    facts.retain(|fact| seen.insert(fact.id.clone()));
893    facts
894}
895
/// Decides whether a progress event is due: on every 500th item and when
/// `done` reaches `total`.
///
/// Callers that do not know the total pass `0`.
fn should_emit_progress(done: usize, total: usize) -> bool {
    const PROGRESS_INTERVAL: usize = 500;
    let finished = done == total;
    let on_interval = done % PROGRESS_INTERVAL == 0;
    finished || on_interval
}
899
900fn compile_globs(patterns: &[String]) -> Result<GlobSet> {
901    let mut builder = GlobSetBuilder::new();
902    for pattern in patterns {
903        builder.add(Glob::new(pattern).map_err(|err| OkError::Config(err.to_string()))?);
904    }
905    builder
906        .build()
907        .map_err(|err| OkError::Config(err.to_string()))
908}
909
910fn hash_bytes(bytes: &[u8]) -> String {
911    let mut hasher = Sha256::new();
912    hasher.update(bytes);
913    format!("{:x}", hasher.finalize())
914}
915
916fn stable_id(value: &str) -> String {
917    hash_bytes(value.as_bytes())
918}
919
920fn dedupe_symbols(symbols: &mut Vec<Symbol>) {
921    let mut seen = HashSet::new();
922    symbols.retain(|symbol| seen.insert(symbol.id.clone()));
923}
924
925fn derive_occurrences(_chunks: &[CodeChunk], symbols: &[Symbol]) -> Vec<SymbolOccurrence> {
926    let mut occurrences = symbols
927        .iter()
928        .map(|symbol| SymbolOccurrence {
929            symbol_id: symbol.id.clone(),
930            file_id: symbol.file_id.clone(),
931            range: symbol.range.clone(),
932            is_definition: true,
933            confidence: symbol.confidence,
934            provenance: symbol.provenance.clone(),
935        })
936        .collect::<Vec<_>>();
937    occurrences.sort_by(|a, b| {
938        (
939            &a.symbol_id.0,
940            &a.file_id.0,
941            a.range.as_ref().map(|r| r.start),
942            a.is_definition,
943        )
944            .cmp(&(
945                &b.symbol_id.0,
946                &b.file_id.0,
947                b.range.as_ref().map(|r| r.start),
948                b.is_definition,
949            ))
950    });
951    occurrences.dedup_by(|a, b| {
952        a.symbol_id == b.symbol_id
953            && a.file_id == b.file_id
954            && a.range == b.range
955            && a.is_definition == b.is_definition
956    });
957    occurrences
958}
959
#[cfg(test)]
mod tests {
    use super::{derive_occurrences, Indexer};
    use open_kioku_config::OkConfig;
    use open_kioku_core::{
        CodeChunk, Confidence, EvidenceSourceType, FileId, Language, LineRange, Symbol, SymbolId,
        SymbolKind,
    };
    use std::process::Command;

    /// Builds a high-confidence TypeScript function symbol on a single line,
    /// living in a file whose id is derived from the symbol id.
    fn symbol(id: &str, name: &str, line: u32) -> Symbol {
        let file_id = FileId::new(format!("file-{id}"));
        let qualified_name = format!("src::index::{name}");
        Symbol {
            id: SymbolId::new(id),
            name: name.into(),
            qualified_name,
            kind: SymbolKind::Function,
            file_id,
            range: Some(LineRange::single(line)),
            language: Language::TypeScript,
            confidence: Confidence::High,
            provenance: EvidenceSourceType::TreeSitter,
        }
    }

    /// A chunk that mentions a symbol by name must not produce reference
    /// occurrences: only one definition per symbol is expected.
    #[test]
    fn derive_occurrences_records_definitions_only_for_heuristic_indexing() {
        let symbols = [symbol("retry", "retry", 1), symbol("render", "render", 2)];
        let chunks = [CodeChunk {
            id: "chunk".into(),
            file_id: FileId::new("file-chunk"),
            range: LineRange { start: 10, end: 12 },
            language: Language::TypeScript,
            text: "retry(); const retried = true;".into(),
            symbol_id: None,
        }];

        let occurrences = derive_occurrences(&chunks, &symbols);
        let (definitions, references): (Vec<_>, Vec<_>) = occurrences
            .iter()
            .partition(|occurrence| occurrence.is_definition);

        assert_eq!(definitions.len(), 2);
        assert_eq!(references.len(), 0);
    }

    /// Indexes a small Gradle/Java fixture with coverage, JUnit and runtime
    /// artifacts and checks the quality counters and derived fact targets.
    #[test]
    fn index_manifest_records_build_and_analysis_provider_signals() {
        let workspace = tempfile::tempdir().unwrap();
        let root = workspace.path();
        let make_dir = |relative: &str| std::fs::create_dir_all(root.join(relative)).unwrap();
        let write_file =
            |relative: &str, contents: &str| std::fs::write(root.join(relative), contents).unwrap();

        // Build-system marker plus one Java source with a route and an env lookup.
        write_file("settings.gradle", "");
        make_dir("src/test/java/org/example");
        write_file(
            "src/test/java/org/example/ExampleTests.java",
            r#"package org.example;
import org.springframework.web.bind.annotation.GetMapping;
class ExampleTests extends BaseTests {
  @GetMapping("/example")
  void works() {
    System.getenv("EXAMPLE_REGION");
  }
}
"#,
        );

        // Coverage and JUnit reports.
        make_dir(".ok/analysis");
        write_file(".ok/analysis/jacoco.xml", "<report/>");
        write_file(
            ".ok/analysis/TEST-org.example.ExampleTests.xml",
            "<testsuite/>",
        );

        // Runtime spans and incidents.
        make_dir(".ok/runtime");
        write_file(
            ".ok/runtime/spans.jsonl",
            r#"{"file":"src/test/java/org/example/ExampleTests.java","line":4,"attributes":{"http.route":"/example","http.request.method":"GET","db.statement":"select * from example_orders"}}"#,
        );
        write_file(
            ".ok/runtime/incidents.jsonl",
            r#"{"file":"src/test/java/org/example/ExampleTests.java","line":5,"error.message":"checkout failure after runtime request"}"#,
        );

        let mut config = OkConfig::default();
        config.scip.enabled = false;
        let snapshot = Indexer::default().index_repo(root, &config).unwrap();

        let quality = &snapshot.manifest.quality;
        assert!(quality.build_systems.contains(&"gradle".to_string()));
        assert_eq!(quality.coverage_reports, 1);
        assert_eq!(quality.junit_reports, 1);
        assert!(quality.static_analysis_facts >= 3);
        assert_eq!(quality.runtime_analysis_facts, 3);

        for expected in [
            "GET /example",
            "example_orders",
            "checkout failure after runtime request",
        ] {
            assert!(snapshot
                .analysis_facts
                .iter()
                .any(|fact| fact.target == expected));
        }

        assert!(quality
            .semantic_provider_notes
            .iter()
            .any(|note| note.contains("build systems detected")));
    }

    /// Git-history facts appear with the default configuration and vanish
    /// once `history.enabled` is switched off.
    #[test]
    fn index_git_history_facts_can_be_disabled() {
        let workspace = tempfile::tempdir().unwrap();
        let root = workspace.path();

        // A one-commit repository with a source file and a matching test file.
        git(root, &["init"]);
        git(root, &["config", "user.email", "test@example.com"]);
        git(root, &["config", "user.name", "Test User"]);
        std::fs::create_dir_all(root.join("src")).unwrap();
        std::fs::create_dir_all(root.join("tests")).unwrap();
        std::fs::write(root.join("src/auth.rs"), "pub fn login() {}\n").unwrap();
        std::fs::write(
            root.join("tests/auth_test.rs"),
            "#[test] fn login_test() {}\n",
        )
        .unwrap();
        git(root, &["add", "."]);
        git(root, &["commit", "-m", "auth with tests"]);

        let mut with_history = OkConfig::default();
        with_history.scip.enabled = false;
        let indexed = Indexer::default().index_repo(root, &with_history).unwrap();
        assert!(indexed.manifest.quality.git_history_facts > 0);
        assert!(indexed.analysis_facts.iter().any(|fact| {
            fact.source_type == EvidenceSourceType::GitHistory
                && fact.target == "tests/auth_test.rs"
        }));

        let mut without_history = with_history;
        without_history.history.enabled = false;
        let indexed = Indexer::default()
            .index_repo(root, &without_history)
            .unwrap();
        assert_eq!(indexed.manifest.quality.git_history_facts, 0);
        assert!(!indexed
            .analysis_facts
            .iter()
            .any(|fact| fact.source_type == EvidenceSourceType::GitHistory));
    }

    /// Runs `git -C <root> <args...>` and panics if the command cannot be
    /// started or exits unsuccessfully.
    fn git(root: &std::path::Path, args: &[&str]) {
        let mut command = Command::new("git");
        command.arg("-C").arg(root).args(args);
        let status = command.status().unwrap();
        assert!(status.success(), "git {args:?} failed");
    }
}