Skip to main content

seshat_scanner/
orchestrator.rs

1//! Scan orchestration — full and incremental project scan pipeline.
2//!
3//! Coordinates file discovery, parsing, module structure analysis,
4//! manifest analysis, documentation ingestion, and persistence of all
5//! results to the database.
6//!
7//! On re-scan, unchanged files (same content hash) are skipped. Changed
8//! files are re-parsed and their IR updated. New files are parsed and
9//! inserted. Deleted files have their IR removed from the database.
10//! Module structure (nodes + edges) is rebuilt from the full set of
11//! parsed files on every scan.
12
13use std::collections::{HashMap, HashSet};
14use std::path::{Path, PathBuf};
15
16use globset::{Glob, GlobSetBuilder};
17use ignore::WalkBuilder;
18use seshat_core::{BranchId, Edge, EdgeId, NodeId, ProjectFile, ScanConfig};
19use seshat_storage::{
20    BranchMetadataRepository, BranchRepository, Database, EdgeRepository, FileIRRepository,
21    NodeRepository, SqliteBranchMetadataRepository, SqliteBranchRepository, SqliteEdgeRepository,
22    SqliteFileIRRepository, SqliteNodeRepository,
23};
24
25use crate::discovery::discover_files;
26use crate::documentation::parse_documentation;
27use crate::error::ScanError;
28use crate::git_dates::collect_git_file_dates;
29use crate::manifest::{ManifestAnalysis, ManifestType, analyze_manifests};
30use crate::module_structure::build_module_graph;
31use crate::parser::{content_hash, parse_file};
32
/// Events reported through the progress callback of
/// [`scan_project_with_progress`].
///
/// Each variant marks the start, the advance, or the end of one pipeline
/// stage so that a front end (spinner, progress bar, log line, ...) can
/// mirror what the scanner is currently doing.
#[derive(Debug, Clone)]
pub enum ScanProgress {
    /// Discovery is running; `count` source files have been found so far.
    Discovering { count: usize },
    /// Discovery finished; `total` source files are queued for scanning.
    DiscoveryDone { total: usize },
    /// Collection of git history (per-file dates) is about to start.
    CollectingGitHistory,
    /// Collection of git history has finished.
    GitHistoryDone,
    /// One more file was handled — parsed, skipped as unchanged, or skipped
    /// as unreadable. `done` out of `total` files are now complete.
    Scanning { done: usize, total: usize },
    /// Every discovered file has been handled; the parse phase is over.
    ScanningDone,
    /// IR persistence and the module-graph rebuild are starting (steps 4-7).
    BuildingModuleGraph,
    /// The module graph has been rebuilt and persisted.
    ModuleGraphDone,
    /// Manifest and documentation analysis is starting (steps 8-9).
    AnalyzingProjectFiles,
    /// Manifest and documentation analysis has finished.
    ProjectFilesDone,

    // -- Submodule progress events (emitted by the scan orchestrator in US-004+) --
    /// `.gitmodules` lists a submodule.
    /// `path` is its mount path relative to the project (e.g. `"vendor/lib"`).
    SubmoduleDetected { path: String },
    /// Scanning of a submodule is beginning.
    /// `path` is the relative mount path; `name` is the short directory name.
    ScanningSubmodule { path: String, name: String },
    /// Scanning of a submodule finished successfully.
    /// `path` is the relative mount path.
    ScanningSubmoduleDone { path: String },
    /// The submodule's commit hash has not changed since the last scan, so
    /// it is considered up-to-date.
    /// `path` is the relative mount path; `hash` is the current commit hash.
    SubmoduleUpToDate { path: String, hash: String },
    /// The submodule was not scanned (not initialized, excluded, etc.).
    /// `path` is the relative mount path; `reason` says why it was skipped.
    SubmoduleSkipped { path: String, reason: String },
}
77
78/// No-op progress callback — used when caller does not need progress.
79fn noop_progress(_: &ScanProgress) {}
80
81/// Summary of a completed scan operation.
82#[derive(Debug, Clone)]
83pub struct ScanResult {
84    /// Number of source files discovered.
85    pub files_discovered: usize,
86    /// Number of source files parsed (may differ from discovered if some were skipped).
87    pub files_parsed: usize,
88    /// Number of knowledge nodes persisted.
89    pub nodes_persisted: usize,
90    /// Number of edges persisted.
91    pub edges_persisted: usize,
92    /// Number of manifest files analyzed.
93    pub manifests_analyzed: usize,
94    /// Number of documentation files ingested.
95    pub docs_ingested: usize,
96    /// Manifest analysis results (dependency declarations + usage stats).
97    pub manifest_analyses: Vec<ManifestAnalysis>,
98    /// Incremental scan statistics (present on re-scans).
99    pub incremental: Option<IncrementalStats>,
100    /// Git file dates collected during the scan (file path → last commit timestamp).
101    /// Exposed so that callers (e.g., CLI) can use them for trend computation
102    /// without re-running `collect_git_file_dates()`.
103    pub file_dates: HashMap<PathBuf, i64>,
104    /// Submodule paths excluded from root discovery (always excluded — they get
105    /// their own separate DBs). Empty when the project has no `.gitmodules`.
106    pub excluded_submodules: Vec<String>,
107    /// Source content for **every** discovered file (full and incremental scans).
108    ///
109    /// On a **full scan** all files are read and stored here.
110    /// On an **incremental re-scan** all files are still read (we must read
111    /// every file to compute its content hash for change detection anyway), so
112    /// the source is never discarded — it is always kept in this map.
113    ///
114    /// Used by convention detectors to extract real source snippets for
115    /// evidence. Every file in `all_files` will have an entry here, so
116    /// `detect_with_source` is always called (never the IR-only `detect`
117    /// fallback) and snippets are always populated.
118    ///
119    /// Memory note: the map holds the full repo source in memory during the
120    /// detection phase, then is dropped. For typical repos this is negligible.
121    pub source_map: HashMap<PathBuf, String>,
122
123    /// Paths of files that are **new or changed** in this scan.
124    ///
125    /// On a **full scan** this equals all discovered files (every file is new).
126    /// On an **incremental re-scan** this contains only the files whose content
127    /// hash changed or that are newly added.
128    ///
129    /// Used by embedding generation to skip re-embedding unchanged files
130    /// (their embeddings are already current in the `code_embeddings` table).
131    /// Convention detectors use the full `source_map` instead.
132    pub changed_paths: HashSet<PathBuf>,
133}
134
/// Per-category file counters collected during an incremental re-scan.
#[derive(Debug, Clone, Default)]
pub struct IncrementalStats {
    /// Files whose content hash matched the stored one; not re-parsed.
    pub files_unchanged: usize,
    /// Files whose content hash differed; re-parsed and their IR updated.
    pub files_changed: usize,
    /// Files with no stored hash from the previous scan; parsed and inserted.
    pub files_new: usize,
    /// Files known to the DB but no longer discovered; their IR is removed.
    pub files_deleted: usize,
}
147
148/// Orchestrate a project scan with automatic incremental support.
149///
150/// Convenience wrapper that calls [`scan_project_with_progress`] with a
151/// no-op callback.
152pub fn scan_project(
153    root: &Path,
154    config: &ScanConfig,
155    db: &Database,
156    branch_id: BranchId,
157) -> Result<ScanResult, ScanError> {
158    scan_project_with_progress(root, config, db, noop_progress, branch_id)
159}
160
161/// Orchestrate a project scan with automatic incremental support and
162/// progress reporting.
163///
164/// If the database already contains file IR records for the branch,
165/// the scan runs incrementally:
166/// - Unchanged files (same content hash) are skipped
167/// - Changed files are re-parsed and their IR updated
168/// - New files are parsed and inserted
169/// - Deleted files have their IR removed
170///
171/// Module structure (nodes + edges) is always rebuilt from the full set
172/// of currently-valid parsed files (combining unchanged from DB + newly
173/// parsed).
174///
175/// # Arguments
176///
177/// * `root` - The project root directory to scan.
178/// * `config` - Scan configuration (exclude patterns, file size limit).
179/// * `db` - The database handle for persistence.
180/// * `on_progress` - Callback invoked at key pipeline stages.
181/// * `branch_id` - The git branch identifier to scope all scan data.
182///
183/// # Returns
184///
185/// A [`ScanResult`] summarizing what was persisted.
186pub fn scan_project_with_progress(
187    root: &Path,
188    config: &ScanConfig,
189    db: &Database,
190    on_progress: impl Fn(&ScanProgress),
191    branch_id: BranchId,
192) -> Result<ScanResult, ScanError> {
193    let conn = db.connection().clone();
194    let file_ir_repo = SqliteFileIRRepository::new(conn.clone());
195    let node_repo = SqliteNodeRepository::new(conn.clone());
196    let edge_repo = SqliteEdgeRepository::new(conn.clone());
197    let branch_repo = SqliteBranchRepository::new(conn);
198
199    let branch = branch_id;
200
201    // Register the branch in the `branches` table so freshness checks
202    // and `list_branches` can find it. Idempotent — safe to call on every
203    // scan. (US-003; US-009 layers `set_last_scanned_commit` on top.)
204    branch_repo.ensure_branch_exists(&branch)?;
205
206    // P18+P19: snapshot HEAD BEFORE discovery starts. The sentinel
207    // write at the tail uses this captured value rather than re-reading
208    // HEAD post-scan, so a commit that landed mid-scan cannot fool the
209    // freshness check into thinking the new tree was already indexed.
210    // Doing this inside the orchestrator means every scan path (CLI
211    // scan, watcher rescan, fallback, review sync) gets the right
212    // sentinel automatically — no per-caller wiring required.
213    let head_at_scan_start: Option<String> = crate::git_utils::get_head_commit(root);
214
215    // ------------------------------------------------------------------
216    // Step 1: Discover source files
217    // ------------------------------------------------------------------
218    let discovery_result = discover_files(root, config)?;
219    let discovered = discovery_result.files;
220    let excluded_submodules = discovery_result.excluded_submodules;
221    let files_discovered = discovered.len();
222    on_progress(&ScanProgress::Discovering {
223        count: files_discovered,
224    });
225    on_progress(&ScanProgress::DiscoveryDone {
226        total: files_discovered,
227    });
228    tracing::info!(count = files_discovered, "Discovered source files");
229
230    // ------------------------------------------------------------------
231    // Step 1b: Collect git file dates
232    // ------------------------------------------------------------------
233    on_progress(&ScanProgress::CollectingGitHistory);
234    let git_file_dates = collect_git_file_dates(root)?;
235    on_progress(&ScanProgress::GitHistoryDone);
236    if !git_file_dates.is_empty() {
237        tracing::info!(
238            files_with_dates = git_file_dates.len(),
239            "Collected git file dates"
240        );
241    }
242
243    // ------------------------------------------------------------------
244    // Step 2: Check for existing data (incremental mode)
245    // ------------------------------------------------------------------
246    let stored_hashes = file_ir_repo.get_file_hashes_by_branch(&branch)?;
247    let is_incremental = !stored_hashes.is_empty();
248
249    // Build a set of discovered file paths (relative, as stored in DB)
250    let discovered_paths: HashSet<String> = discovered
251        .iter()
252        .map(|df| df.path.to_string_lossy().to_string())
253        .collect();
254
255    // ------------------------------------------------------------------
256    // Step 3: Read, hash, and selectively parse files
257    // ------------------------------------------------------------------
258    let mut parsed_files: Vec<ProjectFile> = Vec::with_capacity(files_discovered);
259    // source_map holds source for ALL discovered files — unchanged and changed
260    // alike.  Every file is read from disk anyway to compute its content hash,
261    // so keeping the source costs no extra I/O.  Convention detectors need
262    // source for every file to produce real snippets; discarding source for
263    // unchanged files was the root cause of empty snippets in evidence.
264    let mut source_map: HashMap<PathBuf, String> = HashMap::new();
265    // changed_paths tracks only new/changed files so that embedding generation
266    // can skip re-embedding unchanged files (their embeddings are current in DB).
267    let mut changed_paths: HashSet<PathBuf> = HashSet::new();
268    let mut incremental_stats = IncrementalStats::default();
269
270    let mut scan_done: usize = 0;
271    for df in &discovered {
272        // df.path is RELATIVE to root (Bug #3 fix). Use it as the IR key
273        // (so worktree-prefix differences don't fragment files_ir) and join
274        // with root for any I/O.
275        let file_path_str = df.path.to_string_lossy().to_string();
276        let abs_path = root.join(&df.path);
277
278        let source = match std::fs::read_to_string(&abs_path) {
279            Ok(s) => s,
280            Err(e) => {
281                tracing::warn!(path = %abs_path.display(), error = %e, "Failed to read file, skipping");
282                scan_done += 1;
283                on_progress(&ScanProgress::Scanning {
284                    done: scan_done,
285                    total: files_discovered,
286                });
287                continue;
288            }
289        };
290
291        if is_incremental {
292            // Compute hash first to check if file changed
293            let new_hash = content_hash(&source);
294
295            if let Some(stored_hash) = stored_hashes.get(&file_path_str) {
296                if *stored_hash == new_hash {
297                    // Unchanged — skip re-parsing, load existing IR from DB.
298                    // Keep source in source_map so detectors can still produce
299                    // real snippets for this file's evidence entries.
300                    incremental_stats.files_unchanged += 1;
301                    tracing::debug!(path = %df.path.display(), "File unchanged, skipping re-parse");
302                    source_map.insert(df.path.clone(), source);
303                    scan_done += 1;
304                    on_progress(&ScanProgress::Scanning {
305                        done: scan_done,
306                        total: files_discovered,
307                    });
308                    continue;
309                }
310                // Changed — re-parse
311                incremental_stats.files_changed += 1;
312                tracing::debug!(path = %df.path.display(), "File changed, re-parsing");
313            } else {
314                // New file
315                incremental_stats.files_new += 1;
316                tracing::debug!(path = %df.path.display(), "New file, parsing");
317            }
318        }
319
320        // Pass the RELATIVE path to parse_file so ProjectFile.path (which
321        // becomes files_ir.file_path on upsert) is worktree-independent.
322        let mut project_file = parse_file(&df.path, &source, df.language);
323
324        // Strip local project packages from the dependency list so they are
325        // not mistaken for external dependencies by the detectors.
326        // This is most relevant for Python monorepos where `from myawesomeapp.web
327        // import X` looks identical to `from requests import X` syntactically.
328        if !config.local_packages.is_empty() {
329            project_file
330                .dependencies_used
331                .retain(|dep| !config.local_packages.contains(&dep.package));
332        }
333
334        parsed_files.push(project_file);
335        changed_paths.insert(df.path.clone()); // new/changed — needs embedding update
336        source_map.insert(df.path.clone(), source); // keep source alive for detectors
337        scan_done += 1;
338        on_progress(&ScanProgress::Scanning {
339            done: scan_done,
340            total: files_discovered,
341        });
342    }
343    on_progress(&ScanProgress::ScanningDone);
344
345    let files_parsed = parsed_files.len();
346    tracing::info!(count = files_parsed, "Parsed source files");
347
348    on_progress(&ScanProgress::BuildingModuleGraph);
349
350    // ------------------------------------------------------------------
351    // Step 4: Handle deleted files (present in DB but not on disk)
352    //
353    // Symbol-index rows are deleted alongside `files_ir` in one transaction so
354    // the index stays consistent with the IR.  NotFound is swallowed
355    // defensively — the symbol-index half is still attempted inside the same
356    // tx, which also sweeps any orphan rows left by an earlier non-atomic
357    // delete.
358    // ------------------------------------------------------------------
359    if is_incremental {
360        for stored_path in stored_hashes.keys() {
361            if !discovered_paths.contains(stored_path) {
362                tracing::info!(path = %stored_path, "File deleted, removing IR from DB");
363                let _ = file_ir_repo.delete_with_symbol_index(&branch, stored_path);
364                incremental_stats.files_deleted += 1;
365            }
366        }
367    }
368
369    // ------------------------------------------------------------------
370    // Step 5: Persist file IR + symbol-index (new and changed files)
371    //
372    // Each file goes through a single transaction so the IR and the
373    // symbol_definitions / symbol_imports rows commit together — preventing
374    // the index from drifting out of sync with files_ir on a partial write.
375    // ------------------------------------------------------------------
376    for pf in &parsed_files {
377        // git_file_dates keys are relative paths (as returned by gix tree walk).
378        // pf.path is absolute (from WalkBuilder), so we must strip the root
379        // prefix before looking up the commit date.
380        let rel = pf.path.strip_prefix(root).unwrap_or(&pf.path);
381        let commit_date = git_file_dates.get(rel).copied();
382        file_ir_repo.upsert_with_symbol_index(&branch, pf, commit_date)?;
383    }
384    tracing::info!(count = files_parsed, "Stored file IR records");
385
386    // ------------------------------------------------------------------
387    // Step 6: Gather all current parsed files for module graph
388    //
389    // For incremental scans, we need the full set: unchanged files
390    // (loaded from DB) + newly parsed files.
391    // ------------------------------------------------------------------
392    let all_parsed_files = if is_incremental && incremental_stats.files_unchanged > 0 {
393        // Load all IR from DB (which now has the updated set)
394        file_ir_repo.get_by_branch(&branch)?
395    } else {
396        // Fresh scan or all files changed — use what we just parsed
397        parsed_files.clone()
398    };
399
400    // ------------------------------------------------------------------
401    // Step 7: Rebuild module structure graph
402    //
403    // On re-scan, delete old module nodes and edges first, then
404    // re-insert. This is simpler and more correct than trying to diff
405    // the module graph.
406    // ------------------------------------------------------------------
407    if is_incremental {
408        let deleted_edges = edge_repo.delete_by_branch(&branch)?;
409        // Use delete_facts_by_branch (not delete_by_branch) to preserve
410        // user-confirmed conventions and observations written by `seshat review`.
411        let deleted_nodes = node_repo.delete_facts_by_branch(&branch)?;
412        tracing::debug!(
413            nodes = deleted_nodes,
414            edges = deleted_edges,
415            "Cleared old module structure for rebuild"
416        );
417    }
418
419    let module_graph = build_module_graph(root, &all_parsed_files, &branch);
420
421    // Persist module nodes with placeholder → real ID remapping.
422    let mut id_remap: HashMap<NodeId, NodeId> = HashMap::new();
423    let mut nodes_persisted: usize = 0;
424
425    for node in &module_graph.nodes {
426        let inserted = node_repo.insert(node)?;
427        id_remap.insert(node.id, inserted.id);
428        nodes_persisted += 1;
429    }
430
431    // Persist module edges with remapped source/target IDs.
432    let mut edges_persisted: usize = 0;
433
434    for edge in &module_graph.edges {
435        let remapped_edge = remap_edge(edge, &id_remap);
436        edge_repo.insert(&remapped_edge)?;
437        edges_persisted += 1;
438    }
439
440    tracing::info!(
441        nodes = nodes_persisted,
442        edges = edges_persisted,
443        "Persisted module structure"
444    );
445
446    on_progress(&ScanProgress::ModuleGraphDone);
447    on_progress(&ScanProgress::AnalyzingProjectFiles);
448
449    // ------------------------------------------------------------------
450    // Step 8: Discover and analyze dependency manifests
451    // ------------------------------------------------------------------
452    let manifests = discover_manifests(root)?;
453    let manifests_analyzed = manifests.len();
454
455    let manifest_analyses = if !manifests.is_empty() {
456        let analysis = analyze_manifests(&manifests, &all_parsed_files)?;
457        tracing::info!(count = analysis.len(), "Analyzed dependency manifests");
458        analysis
459    } else {
460        Vec::new()
461    };
462
463    // ------------------------------------------------------------------
464    // Step 8b: Persist auto-detected internal names to branch_metadata
465    //
466    // Collect all internal_names from manifest analyses, union with
467    // config.local_packages (normalising hyphens to underscores), and write
468    // as a JSON array under the "workspace_crates" key, scoped to the
469    // current branch_id, so the graph layer can read them at query time
470    // without cross-branch contamination.
471    //
472    // Only writes when names are non-empty — an empty list on re-scan would
473    // erase previously valid names from a prior scan.
474    // ------------------------------------------------------------------
475    {
476        let mut internal_names: Vec<String> = manifest_analyses
477            .iter()
478            .flat_map(|a| a.internal_names.iter().cloned())
479            .filter(|n| !n.trim().is_empty())
480            .collect();
481
482        // Union with config.local_packages — normalise hyphens to underscores
483        // so they match the normalised crate/package names from manifests.
484        // Dedup with a set, preserving order of auto-detected names first.
485        let mut seen: HashSet<String> = internal_names.iter().cloned().collect();
486        for pkg in &config.local_packages {
487            let normalised = pkg.trim().replace('-', "_");
488            if !normalised.is_empty() && seen.insert(normalised.clone()) {
489                internal_names.push(normalised);
490            }
491        }
492
493        if internal_names.is_empty() {
494            tracing::debug!("No internal names to persist — skipping workspace_crates write");
495        } else {
496            let json = serde_json::to_string(&internal_names).unwrap_or_else(|e| {
497                tracing::warn!(error = %e, "Failed to serialise workspace_crates, storing []");
498                "[]".to_owned()
499            });
500
501            let branch_meta = SqliteBranchMetadataRepository::new(db.connection().clone());
502            if let Err(e) = branch_meta.set(&branch.0, "workspace_crates", &json) {
503                tracing::warn!(error = %e, "Failed to persist workspace_crates to branch_metadata");
504            } else {
505                tracing::info!(
506                    count = internal_names.len(),
507                    branch_id = %branch.0,
508                    "Persisted workspace_crates to branch_metadata"
509                );
510            }
511        }
512    }
513
514    // ------------------------------------------------------------------
515    // Step 8c: Persist tsconfig.json path aliases to branch_metadata
516    //
517    // Same per-branch contract as Step 8b: write the JSON-encoded aliases under
518    // "tsconfig_path_aliases" so the graph layer resolves aliased imports at
519    // query time. Only writes when non-empty — an empty list on re-scan would
520    // erase aliases captured by a prior scan.
521    // ------------------------------------------------------------------
522    {
523        let path_aliases: Vec<_> = manifest_analyses
524            .iter()
525            .flat_map(|a| a.path_aliases.iter().cloned())
526            .collect();
527
528        if path_aliases.is_empty() {
529            tracing::debug!("No path aliases to persist — skipping tsconfig_path_aliases write");
530        } else {
531            let json = serde_json::to_string(&path_aliases).unwrap_or_else(|e| {
532                tracing::warn!(error = %e, "Failed to serialise tsconfig_path_aliases, storing []");
533                "[]".to_owned()
534            });
535
536            let branch_meta = SqliteBranchMetadataRepository::new(db.connection().clone());
537            if let Err(e) = branch_meta.set(&branch.0, "tsconfig_path_aliases", &json) {
538                tracing::warn!(error = %e, "Failed to persist tsconfig_path_aliases to branch_metadata");
539            } else {
540                tracing::info!(
541                    count = path_aliases.len(),
542                    branch_id = %branch.0,
543                    "Persisted tsconfig_path_aliases to branch_metadata"
544                );
545            }
546        }
547    }
548
549    // ------------------------------------------------------------------
550    // Step 9: Discover and parse documentation files
551    // ------------------------------------------------------------------
552    let doc_files = discover_documentation(root, config)?;
553    let docs_ingested = doc_files.len();
554
555    for (doc_path, doc_content) in &doc_files {
556        match parse_documentation(doc_path, doc_content, &branch) {
557            Ok(doc_result) => {
558                for node in &doc_result.nodes {
559                    node_repo.insert(node)?;
560                    nodes_persisted += 1;
561                }
562            }
563            Err(e) => {
564                tracing::warn!(
565                    path = %doc_path.display(),
566                    error = %e,
567                    "Failed to parse documentation, skipping"
568                );
569            }
570        }
571    }
572
573    tracing::info!(
574        count = docs_ingested,
575        nodes = nodes_persisted,
576        "Ingested documentation"
577    );
578
579    on_progress(&ScanProgress::ProjectFilesDone);
580
581    // P19: record the freshness sentinel at the END of the scan, using
582    // the HEAD captured BEFORE discovery (P18). Storage errors here are
583    // logged but not fatal — the scan itself succeeded and shouldn't
584    // regress because the sentinel write hit a transient SQLite error.
585    if let Some(head) = head_at_scan_start.as_deref()
586        && let Err(e) = branch_repo.set_last_scanned_commit(&branch, head)
587    {
588        tracing::warn!(
589            error = %e,
590            branch = %branch.0,
591            "scan_project: failed to record last_scanned_commit; \
592             freshness gate may re-trigger sync next startup"
593        );
594    }
595
596    Ok(ScanResult {
597        files_discovered,
598        files_parsed,
599        nodes_persisted,
600        edges_persisted,
601        manifests_analyzed,
602        docs_ingested,
603        manifest_analyses,
604        incremental: if is_incremental {
605            Some(incremental_stats)
606        } else {
607            None
608        },
609        file_dates: git_file_dates,
610        excluded_submodules,
611        source_map,
612        changed_paths,
613    })
614}
615
616/// Remap an edge's source and target IDs using the placeholder → real ID mapping.
617///
618/// If an ID is not found in the mapping (shouldn't happen in normal flow),
619/// the original ID is preserved.
620fn remap_edge(edge: &Edge, id_remap: &HashMap<NodeId, NodeId>) -> Edge {
621    Edge {
622        id: EdgeId(0), // DB will assign real ID
623        source_id: id_remap
624            .get(&edge.source_id)
625            .copied()
626            .unwrap_or(edge.source_id),
627        target_id: id_remap
628            .get(&edge.target_id)
629            .copied()
630            .unwrap_or(edge.target_id),
631        edge_type: edge.edge_type,
632        branch_id: edge.branch_id.clone(),
633        weight: edge.weight,
634        metadata: edge.metadata.clone(),
635    }
636}
637
638/// Discover dependency manifest files in the project root directory.
639///
640/// Looks for known manifest filenames (`Cargo.toml`, `package.json`,
641/// `pyproject.toml`) in the root directory only (not recursively).
642fn discover_manifests(root: &Path) -> Result<Vec<(PathBuf, String, ManifestType)>, ScanError> {
643    let mut manifests = Vec::new();
644
645    for filename in ManifestType::all_filenames() {
646        let path = root.join(filename);
647        if path.is_file() {
648            let content = std::fs::read_to_string(&path).map_err(|e| ScanError::ManifestError {
649                path: path.clone(),
650                reason: format!("Failed to read manifest: {e}"),
651            })?;
652
653            if let Some(manifest_type) = ManifestType::from_filename(filename) {
654                manifests.push((path, content, manifest_type));
655            }
656        }
657    }
658
659    Ok(manifests)
660}
661
662/// Discover documentation files in the project.
663///
664/// Uses the same [`WalkBuilder`] infrastructure as source-file discovery so
665/// that `.gitignore`, hidden files, and `config.exclude_paths` are all
666/// respected consistently across every discovery flow.
667///
668/// Only `.md` (always), `.json` (JSON Schema only), `.yaml`/`.yml`
669/// (OpenAPI only) files are returned.
670fn discover_documentation(
671    root: &Path,
672    config: &ScanConfig,
673) -> Result<Vec<(PathBuf, String)>, ScanError> {
674    let doc_extensions = ["md", "json", "yaml", "yml"];
675
676    // Build a GlobSet from exclude_paths so we can efficiently check each
677    // relative path against the user-configured exclusions.
678    let exclude_globset = {
679        let mut builder = GlobSetBuilder::new();
680        for pattern in &config.exclude_paths {
681            let glob = Glob::new(pattern).map_err(|e| ScanError::DiscoveryError {
682                path: root.to_path_buf(),
683                reason: format!("Invalid exclude_paths pattern '{pattern}': {e}"),
684            })?;
685            builder.add(glob);
686        }
687        builder.build().map_err(|e| ScanError::DiscoveryError {
688            path: root.to_path_buf(),
689            reason: format!("Failed to build exclude globset: {e}"),
690        })?
691    };
692
693    let mut doc_files = Vec::new();
694
695    let walker = WalkBuilder::new(root)
696        .hidden(true) // skip hidden files/dirs (respects .gitignore convention)
697        .git_ignore(true) // respect .gitignore
698        .git_global(true) // respect global gitignore
699        .git_exclude(true) // respect .git/info/exclude
700        .build();
701
702    for entry_result in walker {
703        let entry = match entry_result {
704            Ok(e) => e,
705            Err(err) => {
706                tracing::warn!("Doc walk error: {err}");
707                continue;
708            }
709        };
710
711        // Only process regular files.
712        let Some(file_type) = entry.file_type() else {
713            continue;
714        };
715        if !file_type.is_file() {
716            continue;
717        }
718
719        let path = entry.path();
720
721        // Check extension first (cheap filter).
722        let ext = match path.extension().and_then(|e| e.to_str()) {
723            Some(e) => e,
724            None => continue,
725        };
726        if !doc_extensions.contains(&ext) {
727            continue;
728        }
729
730        // Compute relative path and check against exclude_paths.
731        let relative = path.strip_prefix(root).unwrap_or(path).to_path_buf();
732        if !exclude_globset.is_empty() && exclude_globset.is_match(&relative) {
733            tracing::debug!(
734                path = %relative.display(),
735                "Skipping doc file (matched exclude_paths)"
736            );
737            continue;
738        }
739
740        // Read content and validate format.
741        let content = match std::fs::read_to_string(path) {
742            Ok(c) => c,
743            Err(e) => {
744                tracing::warn!(path = %path.display(), error = %e, "Cannot read doc file");
745                continue;
746            }
747        };
748
749        // For JSON and YAML, only ingest if they match a supported doc format.
750        if (ext == "json" || ext == "yaml" || ext == "yml")
751            && !is_documentation_content(ext, &content)
752        {
753            continue;
754        }
755
756        doc_files.push((relative, content));
757    }
758
759    Ok(doc_files)
760}
761
762/// Check if file content matches a known documentation format.
763///
764/// JSON files must look like a JSON Schema (have `$schema`, `properties`, or
765/// `type` + `title`). YAML files must have `openapi` or `swagger` top-level key.
766fn is_documentation_content(ext: &str, content: &str) -> bool {
767    match ext {
768        "json" => {
769            // Check for JSON Schema indicators
770            let Ok(value) = serde_json::from_str::<serde_json::Value>(content) else {
771                return false;
772            };
773            let obj = match value.as_object() {
774                Some(o) => o,
775                None => return false,
776            };
777            obj.contains_key("$schema")
778                || obj.contains_key("properties")
779                || (obj.contains_key("type") && obj.contains_key("title"))
780        }
781        "yaml" | "yml" => {
782            // Check for OpenAPI/Swagger indicators
783            let Ok(value) = serde_norway::from_str::<serde_norway::Value>(content) else {
784                return false;
785            };
786            let mapping = match value.as_mapping() {
787                Some(m) => m,
788                None => return false,
789            };
790            let has_openapi =
791                mapping.contains_key(serde_norway::Value::String("openapi".to_string()));
792            let has_swagger =
793                mapping.contains_key(serde_norway::Value::String("swagger".to_string()));
794            has_openapi || has_swagger
795        }
796        _ => false,
797    }
798}
799
#[cfg(test)]
mod tests {
    use super::*;
    use seshat_core::ScanConfig;
    use seshat_storage::{Database, RepoMetadataRepository};
    use std::fs;
    use tempfile::tempdir;

    /// Content written to `src/main.rs` of the fixture project.
    const MAIN_RS: &str = r#"
use std::io;
use crate::config::Config;

pub fn main() {
    println!("hello");
}

fn helper() -> bool {
    true
}
"#;

    /// Content written to `src/config.rs` of the fixture project.
    const CONFIG_RS: &str = r#"
pub struct Config {
    pub name: String,
    pub debug: bool,
}

impl Config {
    pub fn new() -> Self {
        Config {
            name: String::new(),
            debug: false,
        }
    }
}
"#;

    /// Content written to `src/utils/format.rs` of the fixture project.
    const FORMAT_RS: &str = r#"
use crate::config::Config;

pub fn format_name(config: &Config) -> String {
    config.name.clone()
}
"#;

    /// Content written to `README.md` of the fixture project.
    const README_MD: &str = r#"# Test Project

## Overview
A simple test project.

## Features
- Feature one
- Feature two
"#;

    /// Open a fresh in-memory database.
    fn memory_db() -> Database {
        Database::open(":memory:").expect("open DB")
    }

    /// The branch id used by most tests.
    fn main_branch() -> BranchId {
        BranchId::from("main")
    }

    /// Create an empty `.git` directory so WalkBuilder activates .gitignore parsing.
    fn init_git_dir(root: &Path) {
        fs::create_dir_all(root.join(".git")).unwrap();
    }

    /// Write a minimal `Cargo.toml` declaring a package called `crate_name`.
    fn write_cargo_manifest(root: &Path, crate_name: &str) {
        let manifest = format!(
            "[package]\nname = \"{crate_name}\"\nversion = \"0.1.0\"\nedition = \"2021\"\n"
        );
        fs::write(root.join("Cargo.toml"), manifest).unwrap();
    }

    /// Lay out a one-crate project: `.git/`, `Cargo.toml` and `src/lib.rs`.
    fn write_single_crate_project(root: &Path, crate_name: &str, lib_source: &str) {
        init_git_dir(root);
        write_cargo_manifest(root, crate_name);
        let src_dir = root.join("src");
        fs::create_dir_all(&src_dir).unwrap();
        fs::write(src_dir.join("lib.rs"), lib_source).unwrap();
    }

    /// Build the standard fixture: three Rust files (two in `src/`, one in
    /// `src/utils/`) plus a `README.md`, inside a temporary directory.
    fn create_test_project() -> tempfile::TempDir {
        let project = tempdir().expect("create tempdir");
        let root = project.path();

        init_git_dir(root);

        let src_dir = root.join("src");
        let utils_dir = src_dir.join("utils");
        fs::create_dir_all(&utils_dir).unwrap();

        let fixtures = [
            (src_dir.join("main.rs"), MAIN_RS),
            (src_dir.join("config.rs"), CONFIG_RS),
            (utils_dir.join("format.rs"), FORMAT_RS),
            (root.join("README.md"), README_MD),
        ];
        for (path, contents) in fixtures {
            fs::write(path, contents).unwrap();
        }

        project
    }

    #[test]
    fn scan_project_discovers_and_parses_files() {
        let project = create_test_project();
        let db = memory_db();

        let outcome = scan_project(project.path(), &ScanConfig::default(), &db, main_branch())
            .expect("scan should succeed");

        assert_eq!(outcome.files_discovered, 3, "should discover 3 .rs files");
        assert_eq!(outcome.files_parsed, 3, "should parse all 3 files");
    }

    #[test]
    fn scan_project_stores_ir_in_database() {
        let project = create_test_project();
        let db = memory_db();

        scan_project(project.path(), &ScanConfig::default(), &db, main_branch())
            .expect("scan should succeed");

        // The IR rows must be readable back through the repository.
        let repo = SqliteFileIRRepository::new(db.connection().clone());
        let stored = repo.get_by_branch(&main_branch()).expect("get files");
        assert_eq!(stored.len(), 3, "should have 3 file IR records");
    }

    #[test]
    fn scan_project_stores_content_hash() {
        let project = create_test_project();
        let db = memory_db();

        scan_project(project.path(), &ScanConfig::default(), &db, main_branch())
            .expect("scan should succeed");

        // Every persisted file must carry a content hash.
        let repo = SqliteFileIRRepository::new(db.connection().clone());
        let stored = repo.get_by_branch(&main_branch()).expect("get files");
        for file in stored.iter() {
            assert!(
                !file.content_hash.is_empty(),
                "content hash should be non-empty for {}",
                file.path.display()
            );
        }
    }

    #[test]
    fn scan_project_persists_module_nodes() {
        let project = create_test_project();
        let db = memory_db();

        let outcome = scan_project(project.path(), &ScanConfig::default(), &db, main_branch())
            .expect("scan should succeed");

        // Files live in src/ and src/utils/, hence at least two module nodes.
        assert!(
            outcome.nodes_persisted >= 2,
            "should persist at least 2 module nodes, got {}",
            outcome.nodes_persisted
        );

        // The same must hold for what actually landed in the database.
        let node_repo = SqliteNodeRepository::new(db.connection().clone());
        let nodes = node_repo.find_by_branch(&main_branch()).expect("find nodes");
        assert!(
            nodes.len() >= 2,
            "should have at least 2 nodes in DB, got {}",
            nodes.len()
        );
    }

    #[test]
    fn scan_project_persists_edges() {
        let project = create_test_project();
        let db = memory_db();

        let outcome = scan_project(project.path(), &ScanConfig::default(), &db, main_branch())
            .expect("scan should succeed");

        // At minimum the PartOf edge (src/utils PartOf src) is expected.
        assert!(
            outcome.edges_persisted >= 1,
            "should persist at least 1 edge, got {}",
            outcome.edges_persisted
        );

        // Confirm the edge is really stored.
        let edge_repo = SqliteEdgeRepository::new(db.connection().clone());
        let part_of = edge_repo
            .find_by_type(seshat_core::EdgeType::PartOf)
            .expect("find PartOf edges");
        assert!(!part_of.is_empty(), "should have at least 1 PartOf edge");
    }

    #[test]
    fn scan_project_ingests_documentation() {
        let project = create_test_project();
        let db = memory_db();

        let outcome = scan_project(project.path(), &ScanConfig::default(), &db, main_branch())
            .expect("scan should succeed");

        assert!(
            outcome.docs_ingested >= 1,
            "should ingest at least 1 documentation file (README.md), got {}",
            outcome.docs_ingested
        );
    }

    #[test]
    fn scan_project_empty_directory() {
        let project = tempdir().expect("create tempdir");
        // Only a .git directory, so that WalkBuilder works; no sources at all.
        init_git_dir(project.path());

        let db = memory_db();
        let outcome = scan_project(project.path(), &ScanConfig::default(), &db, main_branch())
            .expect("scan should succeed on empty project");

        assert_eq!(outcome.files_discovered, 0);
        assert_eq!(outcome.files_parsed, 0);
        assert_eq!(outcome.nodes_persisted, 0);
        assert_eq!(outcome.edges_persisted, 0);
    }

    #[test]
    fn scan_project_respects_config_exclude_paths() {
        let project = create_test_project();
        let db = memory_db();

        // Everything below a utils/ directory is excluded.
        let config = ScanConfig {
            exclude_paths: vec![String::from("**/utils/**")],
            ..ScanConfig::default()
        };

        let outcome = scan_project(project.path(), &config, &db, main_branch())
            .expect("scan should succeed");

        // Only main.rs and config.rs remain; utils/format.rs is filtered out.
        assert_eq!(
            outcome.files_discovered, 2,
            "should discover 2 files (utils excluded)"
        );
    }

    #[test]
    fn discover_manifests_finds_cargo_toml() {
        let project = tempdir().expect("create tempdir");
        write_cargo_manifest(project.path(), "test");

        let found = discover_manifests(project.path()).expect("discover manifests");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].2, ManifestType::CargoToml);
    }

    #[test]
    fn discover_manifests_finds_nothing_without_manifests() {
        let project = tempdir().expect("create tempdir");
        let found = discover_manifests(project.path()).expect("discover manifests");
        assert!(found.is_empty());
    }

    #[test]
    fn is_documentation_content_json_schema() {
        let schema = r#"{"$schema": "http://json-schema.org/draft-07/schema#", "type": "object"}"#;
        let plain = r#"{"name": "foo", "value": 42}"#;

        assert!(is_documentation_content("json", schema));
        assert!(!is_documentation_content("json", plain));
    }

    #[test]
    fn is_documentation_content_openapi() {
        let spec = "openapi: '3.0.0'\ninfo:\n  title: Test\n  version: '1.0'\npaths: {}";
        let plain = "name: test\nvalue: 42";

        assert!(is_documentation_content("yaml", spec));
        assert!(!is_documentation_content("yaml", plain));
    }

    #[test]
    fn remap_edge_applies_id_mapping() {
        let id_map = HashMap::from([(NodeId(1), NodeId(100)), (NodeId(2), NodeId(200))]);

        let original = Edge {
            id: EdgeId(0),
            source_id: NodeId(1),
            target_id: NodeId(2),
            edge_type: seshat_core::EdgeType::DependsOn,
            branch_id: main_branch(),
            weight: 1.0,
            metadata: None,
        };

        let remapped = remap_edge(&original, &id_map);
        assert_eq!(remapped.source_id, NodeId(100));
        assert_eq!(remapped.target_id, NodeId(200));
    }

    #[test]
    fn scan_project_incremental_skips_unchanged() {
        let project = create_test_project();
        let root = project.path();
        let db = memory_db();
        let config = ScanConfig::default();
        let scan = || scan_project(root, &config, &db, main_branch());

        // First pass: a full scan.
        let first = scan().expect("first scan");
        assert!(first.incremental.is_none(), "first scan is not incremental");
        assert_eq!(first.files_parsed, 3);

        // Second pass over an untouched tree.
        let second = scan().expect("second scan");
        assert!(second.incremental.is_some(), "second scan is incremental");
        let stats = second.incremental.as_ref().unwrap();
        assert_eq!(stats.files_unchanged, 3);
        assert_eq!(stats.files_changed, 0);
        assert_eq!(stats.files_new, 0);
        assert_eq!(stats.files_deleted, 0);
        assert_eq!(second.files_parsed, 0, "no files re-parsed");
    }

    #[test]
    fn scan_project_incremental_detects_modification() {
        let project = create_test_project();
        let root = project.path();
        let db = memory_db();
        let config = ScanConfig::default();
        let scan = || scan_project(root, &config, &db, main_branch());

        scan().expect("first scan");

        // Rewrite one file between the two scans.
        fs::write(
            root.join("src/config.rs"),
            "pub struct Config { pub name: String, pub extra: bool }\n",
        )
        .unwrap();

        let second = scan().expect("second scan");
        let stats = second.incremental.as_ref().unwrap();
        assert_eq!(stats.files_changed, 1, "config.rs changed");
        assert_eq!(stats.files_unchanged, 2, "main.rs + format.rs unchanged");
        assert_eq!(second.files_parsed, 1, "only changed file parsed");
    }

    #[test]
    fn scan_project_incremental_detects_addition() {
        let project = create_test_project();
        let root = project.path();
        let db = memory_db();
        let config = ScanConfig::default();
        let scan = || scan_project(root, &config, &db, main_branch());

        scan().expect("first scan");

        // Drop a brand-new file into the tree.
        fs::write(root.join("src/extra.rs"), "pub fn extra() {}").unwrap();

        let second = scan().expect("second scan");
        let stats = second.incremental.as_ref().unwrap();
        assert_eq!(stats.files_new, 1);
        assert_eq!(stats.files_unchanged, 3);
        assert_eq!(second.files_discovered, 4);
    }

    #[test]
    fn scan_project_incremental_detects_deletion() {
        let project = create_test_project();
        let root = project.path();
        let db = memory_db();
        let config = ScanConfig::default();
        let scan = || scan_project(root, &config, &db, main_branch());

        scan().expect("first scan");

        // Remove one of the previously scanned files.
        fs::remove_file(root.join("src/utils/format.rs")).unwrap();

        let second = scan().expect("second scan");
        let stats = second.incremental.as_ref().unwrap();
        assert_eq!(stats.files_deleted, 1);
        assert_eq!(stats.files_unchanged, 2);
        assert_eq!(second.files_discovered, 2);

        // The database must have forgotten the deleted file too.
        let repo = SqliteFileIRRepository::new(db.connection().clone());
        let remaining = repo.get_by_branch(&main_branch()).unwrap();
        assert_eq!(remaining.len(), 2);
    }

    // ── source_map / changed_paths regression tests ───────────────────────────
    //
    // The tests below fix the contract whose violation produced the "empty
    // snippets" regression: source_map always lists EVERY discovered file (so
    // detectors can run detect_with_source on each of them), whereas
    // changed_paths lists only new/changed files (so embeddings are not
    // regenerated without need).

    #[test]
    fn full_scan_source_map_contains_all_files() {
        let project = create_test_project();
        let db = memory_db();

        let outcome = scan_project(project.path(), &ScanConfig::default(), &db, main_branch())
            .expect("scan should succeed");

        // A full scan puts every discovered file into source_map.
        assert_eq!(
            outcome.source_map.len(),
            outcome.files_discovered,
            "source_map must contain all {} discovered files on full scan, got {}",
            outcome.files_discovered,
            outcome.source_map.len()
        );
        // Every file counts as "new" on a full scan, so changed_paths covers them all.
        assert_eq!(
            outcome.changed_paths.len(),
            outcome.files_discovered,
            "changed_paths must equal files_discovered on full scan"
        );
        // The stored sources are real file contents, never empty strings.
        for (path, src) in &outcome.source_map {
            assert!(!src.is_empty(), "source for {:?} must not be empty", path);
        }
    }

    #[test]
    fn incremental_scan_source_map_contains_all_files() {
        // Regression test for the "empty snippets" bug: even when an
        // incremental re-scan finds nothing to re-parse, source_map has to
        // list ALL files so that detect_with_source runs for each of them and
        // snippets get populated.
        let project = create_test_project();
        let root = project.path();
        let db = memory_db();
        let config = ScanConfig::default();
        let scan = || scan_project(root, &config, &db, main_branch());

        // Full scan first, then a re-scan with NO file changes in between.
        scan().expect("first scan");
        let second = scan().expect("second scan");
        let stats = second.incremental.as_ref().unwrap();

        assert_eq!(stats.files_unchanged, 3, "all 3 files should be unchanged");
        assert_eq!(second.files_parsed, 0, "no files should be re-parsed");

        // KEY ASSERTION: nothing was re-parsed, yet source_map is complete.
        assert_eq!(
            second.source_map.len(),
            second.files_discovered,
            "source_map must contain all {} files on incremental scan (no changes), got {} — \
             this would cause empty snippets in convention evidence",
            second.files_discovered,
            second.source_map.len()
        );

        // Nothing changed, so changed_paths stays empty.
        assert!(
            second.changed_paths.is_empty(),
            "changed_paths must be empty when no files changed, got {} paths",
            second.changed_paths.len()
        );

        // And none of the mapped sources may be empty.
        for (path, src) in &second.source_map {
            assert!(
                !src.is_empty(),
                "source for {:?} must not be empty on incremental scan",
                path
            );
        }
    }

    #[test]
    fn scan_persists_workspace_crates_with_local_packages_union() {
        // After a scan, branch_metadata["workspace_crates"] for the scanned
        // branch must hold the crate names auto-detected from Cargo.toml
        // UNIONED with config.local_packages, without duplicates. The entry is
        // keyed by branch_id and is not stored in the global repo_metadata slot.
        let project = tempdir().expect("create tempdir");
        let root = project.path();

        // A .git directory (so WalkBuilder works), a manifest naming the
        // crate, and one dummy source file so the scanner has input to parse.
        write_single_crate_project(root, "auto-detected-crate", "pub fn hello() {}\n");

        let config = ScanConfig {
            local_packages: vec![
                // Hyphenated as a user would type it; the orchestrator
                // normalises hyphens to underscores.
                "extra-package",
                // Same crate as the auto-detected one once normalised.
                "auto_detected_crate",
            ]
            .into_iter()
            .map(String::from)
            .collect(),
            ..ScanConfig::default()
        };

        let db = memory_db();
        let branch = main_branch();
        scan_project(root, &config, &db, branch.clone()).expect("scan should succeed");

        // Fetch workspace_crates from the per-branch metadata slot.
        let branch_meta = SqliteBranchMetadataRepository::new(db.connection().clone());
        let json = branch_meta
            .get(&branch.0, "workspace_crates")
            .expect("branch_metadata query must succeed")
            .expect("workspace_crates key must be present for the scanned branch");

        let names: Vec<String> =
            serde_json::from_str(&json).expect("workspace_crates must be valid JSON array");

        // The auto-detected crate, with hyphens normalised to underscores.
        assert!(
            names.iter().any(|name| name == "auto_detected_crate"),
            "auto-detected crate must be present; got {:?}",
            names
        );
        // The additional local_packages entry, normalised the same way.
        assert!(
            names.iter().any(|name| name == "extra_package"),
            "extra_package (normalised) from local_packages must be present; got {:?}",
            names
        );
        // No name may appear twice.
        let unique: HashSet<&String> = names.iter().collect();
        assert_eq!(
            unique.len(),
            names.len(),
            "workspace_crates must not contain duplicates; got {:?}",
            names
        );

        // Regression guard: the scanner no longer writes the global
        // repo_metadata slot; a value there would point at a stale code path.
        let repo_meta = seshat_storage::SqliteRepoMetadataRepository::new(db.connection().clone());
        let global_slot = repo_meta
            .get("workspace_crates")
            .expect("repo_metadata query must succeed");
        assert!(
            global_slot.is_none(),
            "repo_metadata['workspace_crates'] must not be written by the scanner anymore",
        );

        // The scanned branch owns exactly one branch_metadata row.
        let all = branch_meta
            .list(&branch.0)
            .expect("list branch_metadata must succeed");
        assert_eq!(
            all.len(),
            1,
            "exactly one branch_metadata row expected after a single scan; got {:?}",
            all
        );
    }

    #[test]
    fn scan_two_branches_isolates_workspace_crates() {
        // Regression test: two branches scanned with different Cargo.toml
        // manifests must end up with two independent branch_metadata rows.
        // Neither overwrites the other, and each branch reads back only its
        // own workspace_crates list.
        //
        // Each branch gets a root directory of its own so that the manifests
        // can really differ (the scanner is keyed by branch_id but reads the
        // manifest from disk at scan time), mirroring a worktree-per-branch
        // checkout.
        let db = memory_db();
        let config = ScanConfig::default();

        // ---- main: a single crate "main_only_crate" ---------------------
        let main_dir = tempdir().expect("create main tempdir");
        let main_root = main_dir.path();
        write_single_crate_project(main_root, "main-only-crate", "pub fn m() {}\n");

        // ---- feature: a different crate "feature_only_crate" ------------
        let feature_dir = tempdir().expect("create feature tempdir");
        let feature_root = feature_dir.path();
        write_single_crate_project(feature_root, "feature-only-crate", "pub fn f() {}\n");

        let main_id = main_branch();
        let feature_id = BranchId::from("feature");

        scan_project(main_root, &config, &db, main_id.clone()).expect("scan main");
        scan_project(feature_root, &config, &db, feature_id.clone()).expect("scan feature");

        let branch_meta = SqliteBranchMetadataRepository::new(db.connection().clone());

        let main_json = branch_meta
            .get(&main_id.0, "workspace_crates")
            .unwrap()
            .expect("workspace_crates must exist for main");
        let feature_json = branch_meta
            .get(&feature_id.0, "workspace_crates")
            .unwrap()
            .expect("workspace_crates must exist for feature");

        let main_names: Vec<String> = serde_json::from_str(&main_json).unwrap();
        let feature_names: Vec<String> = serde_json::from_str(&feature_json).unwrap();

        // A branch lists its own crate and never the other branch's crate.
        assert!(
            main_names.iter().any(|name| name == "main_only_crate"),
            "main branch must see its own crate; got {:?}",
            main_names
        );
        assert!(
            !main_names.iter().any(|name| name == "feature_only_crate"),
            "main branch must not see feature's crate; got {:?}",
            main_names
        );
        assert!(
            feature_names.iter().any(|name| name == "feature_only_crate"),
            "feature branch must see its own crate; got {:?}",
            feature_names
        );
        assert!(
            !feature_names.iter().any(|name| name == "main_only_crate"),
            "feature branch must not see main's crate; got {:?}",
            feature_names
        );

        // A second scan of main must UPSERT: main keeps exactly one row and
        // the feature row is left exactly as it was.
        scan_project(main_root, &config, &db, main_id.clone()).expect("re-scan main");
        let main_rows = branch_meta.list(&main_id.0).unwrap();
        assert_eq!(main_rows.len(), 1, "main must UPSERT, not duplicate");
        let feature_after = branch_meta
            .get(&feature_id.0, "workspace_crates")
            .unwrap()
            .expect("feature row must survive a re-scan on main");
        assert_eq!(
            feature_after, feature_json,
            "re-scanning main must not mutate feature's workspace_crates",
        );
    }

    #[test]
    fn incremental_scan_changed_paths_contains_only_modified_files() {
        let project = create_test_project();
        let root = project.path();
        let db = memory_db();
        let config = ScanConfig::default();
        let scan = || scan_project(root, &config, &db, main_branch());

        scan().expect("first scan");

        // Since Bug #3, source_map / changed_paths store paths relative to
        // the scan root, so the relative form is what gets compared below.
        let changed_file = PathBuf::from("src/config.rs");
        // Touch exactly one file.
        fs::write(
            root.join(&changed_file),
            "pub struct Config { pub extra: bool }\n",
        )
        .unwrap();

        let second = scan().expect("second scan");

        // source_map keeps listing ALL files.
        assert_eq!(
            second.source_map.len(),
            second.files_discovered,
            "source_map must contain all files even on incremental scan"
        );

        // changed_paths lists the modified file and nothing else.
        assert_eq!(
            second.changed_paths.len(),
            1,
            "changed_paths must contain exactly 1 file (the modified one), got: {:?}",
            second.changed_paths
        );
        assert!(
            second.changed_paths.contains(&changed_file),
            "changed_paths must contain the modified file {:?}, got: {:?}",
            changed_file,
            second.changed_paths
        );

        // Every other file is present in source_map but absent from changed_paths.
        for path in second.source_map.keys() {
            if path == &changed_file {
                continue;
            }
            assert!(
                !second.changed_paths.contains(path),
                "unchanged file {:?} must not be in changed_paths",
                path
            );
        }
    }
}