Skip to main content

seshat_cli/
scan.rs

1//! Implementation of the `seshat scan <path>` command.
2//!
3//! Runs the full scan pipeline: discovery -> parse -> detect -> aggregate -> store,
4//! with uniform spinner-based progress display for all phases.
5
6use std::path::Path;
7use std::time::Instant;
8
9use indicatif::{ProgressBar, ProgressStyle};
10use seshat_core::{BranchId, DetectionConfig};
11use seshat_detectors::{aggregate_findings, run_all_detectors};
12use seshat_scanner::{
13    ScanProgress, ScanResult, detect_submodule_paths, scan_project_with_progress,
14};
15use seshat_storage::{
16    Database, EmbeddingInput, EmbeddingRepository, RepoMetadataRepository,
17    SqliteEmbeddingRepository, SqliteRepoMetadataRepository, SqliteSubmoduleRepository,
18    StaleIrWipeReport, SubmoduleInput, SubmoduleRepository, wipe_stale_ir_cache,
19};
20
21use crate::config::AppConfig;
22use crate::db::unix_now;
23use crate::error::CliError;
24use crate::format::{self, Verbosity};
25
26/// Run the scan command on the given project directory.
27///
28/// # Pipeline
29///
30/// 1. Validate path
31/// 2. Load config from `seshat.toml` (or defaults)
32/// 3. Open database in XDG data directory
33/// 4. Run scan pipeline with progress reporting
34/// 5. Run convention detectors
35/// 6. Aggregate findings
36/// 7. Print report (verbosity-aware)
37pub fn run_scan(
38    path: &Path,
39    verbose: bool,
40    quiet: bool,
41    exclude_submodules: bool,
42) -> Result<(), CliError> {
43    let verbosity = Verbosity::from_flags(verbose, quiet);
44    let color = format::color_enabled();
45
46    // -- Validate path ------------------------------------------------
47    if !path.exists() {
48        return Err(CliError::InvalidPath {
49            path: path.display().to_string(),
50            reason: "path does not exist".to_owned(),
51        });
52    }
53    if !path.is_dir() {
54        return Err(CliError::InvalidPath {
55            path: path.display().to_string(),
56            reason: "path is not a directory".to_owned(),
57        });
58    }
59
60    // Resolve the project through the SHARED resolver: walks up to the git
61    // common-dir parent so all worktrees of a single repo land in one DB.
62    // For non-git directories the resolver canonicalises the input and uses
63    // its basename, matching the legacy scan-from-cwd behaviour.
64    let resolved = crate::db::resolve_project(Some(path), "scan")?;
65    let root = resolved.project_root.clone();
66    let db_path = resolved.db_path.clone();
67    let project_name = resolved.project_name.clone();
68
69    // -- Version header -----------------------------------------------
70    if verbosity.show_warnings() {
71        eprintln!("seshat v{}", env!("CARGO_PKG_VERSION"));
72    }
73
74    // -- Load config --------------------------------------------------
75    let mut config =
76        AppConfig::load().map_err(|e| CliError::scan(format!("failed to load config: {e}")))?;
77
78    // CLI flag overrides config file value.
79    if exclude_submodules {
80        config.scan.exclude_submodules = true;
81    }
82
83    // -- Open database ------------------------------------------------
84    if let Some(parent) = db_path.parent() {
85        std::fs::create_dir_all(parent)
86            .map_err(|e| CliError::scan(format!("failed to create database directory: {e}")))?;
87    }
88    let db = Database::open(&db_path)
89        .map_err(|e| CliError::scan(format!("failed to open database: {e}")))?;
90
91    // -- Auto-recover from a stale IR cache ---------------------------
92    // When `IR_SCHEMA_VERSION` is bumped, every cached `files_ir` blob in an
93    // existing DB becomes undeserialisable. Without this, the upcoming
94    // `get_by_branch` in the scanner would hard-fail and force users to
95    // delete the DB by hand (including user-curated decisions). Wipe the
96    // stale cache up-front so the scan re-parses from source; the user-data
97    // tables are untouched.
98    let wipe = wipe_stale_ir_cache(&db)
99        .map_err(|e| CliError::scan(format!("failed to clear stale IR cache: {e}")))?;
100    report_ir_cache_wipe(&wipe, "root", verbosity.show_warnings());
101
102    // -- Detect submodules early (before root scan) --------------------
103    let submodule_paths = detect_submodule_paths(&root);
104
105    // Detect git branch for scan scoping.
106    let scan_branch = crate::db::get_current_branch(&root)
107        .map(seshat_core::BranchId::from)
108        .unwrap_or_else(|| {
109            tracing::debug!(root = %root.display(), "Could not detect git branch for scan root, defaulting to 'main'");
110            seshat_core::BranchId::from("main")
111        });
112
113    // -- Scan submodules first (each gets its own DB) -----------------
114    let start = Instant::now();
115
116    let show = verbosity.show_warnings();
117
118    // -- Submodule scan phase -----------------------------------------
119    // Track scanned submodules for updating the root DB's submodules table.
120    struct ScannedSubmodule {
121        mount_path: String,
122        name: String,
123        db_path: String,
124        commit_hash: Option<String>,
125    }
126
127    // Look up stored submodule records from the root DB for change detection.
128    let root_sub_repo_for_detect = SqliteSubmoduleRepository::new(db.connection().clone());
129
130    // Scan submodules in parallel using std::thread::scope.
131    // Each submodule gets its own thread, DB connection, and spinner line.
132    // The root scan runs after all submodule threads complete.
133    let scanned_submodules: Vec<ScannedSubmodule> = if !config.scan.exclude_submodules
134        && !submodule_paths.is_empty()
135    {
136        // Pre-filter submodules: detect, check initialization, run change detection.
137        // This is done on the main thread since it's fast (no scanning).
138        enum SubmoduleAction {
139            Skip(ScannedSubmodule),
140            Scan {
141                mount_path: String,
142                name: String,
143                submodule_abs: std::path::PathBuf,
144                commit_hash: Option<String>,
145            },
146        }
147
148        let mut actions: Vec<SubmoduleAction> = Vec::new();
149
150        for mount_path in &submodule_paths {
151            let submodule_abs = root.join(mount_path);
152            let name = mount_path
153                .rsplit('/')
154                .next()
155                .unwrap_or(mount_path)
156                .to_string();
157
158            // Emit SubmoduleDetected for each discovered submodule.
159            if show {
160                eprintln!("  \u{2139} Submodule detected: {mount_path}");
161            }
162
163            // Check if initialized (non-empty dir with .git).
164            if !submodule_abs.is_dir()
165                || (!submodule_abs.join(".git").exists() && !submodule_abs.join(".git").is_file())
166            {
167                if show {
168                    let reason = "not initialized (no .git)";
169                    eprintln!("  \u{2298} Submodule {name} skipped: {reason}");
170                }
171                continue;
172            }
173
174            // Get the current commit hash for the submodule.
175            let commit_hash = seshat_scanner::get_head_commit(&submodule_abs);
176
177            // -- Change detection: compare current hash with stored hash ------
178            let stored_record = root_sub_repo_for_detect
179                .find_by_path(mount_path)
180                .map_err(|e| {
181                    CliError::scan(format!("failed to look up submodule '{mount_path}': {e}"))
182                })?;
183
184            if let Some(ref stored) = stored_record {
185                // Both hashes must be Some and equal for an up-to-date match.
186                if let (Some(current_hash), Some(stored_hash)) = (&commit_hash, &stored.commit_hash)
187                {
188                    if current_hash == stored_hash {
189                        // Commit hash matches — check whether the IR schema
190                        // version in the existing DB is still current.
191                        // If it isn't (e.g. IR_SCHEMA_VERSION was bumped since
192                        // the last scan), we must re-scan even though the files
193                        // haven't changed, so that all rows are rewritten with
194                        // the new schema version and become visible to queries.
195                        //
196                        // Use stored.db_path (already the resolved path written
197                        // by the previous scan) to open the submodule DB.
198                        let sub_branch_for_check = crate::db::get_current_branch(&submodule_abs)
199                            .unwrap_or_else(|| {
200                                tracing::debug!(submodule = %submodule_abs.display(), "Could not detect branch for submodule, defaulting to 'main'");
201                                "main".to_owned()
202                            });
203                        let schema_ok =
204                            seshat_storage::Database::open(std::path::Path::new(&stored.db_path))
205                                .ok()
206                                .map(|sub_db| {
207                                    crate::db::submodule_ir_schema_is_current(
208                                        &sub_db,
209                                        &sub_branch_for_check,
210                                    )
211                                })
212                                .unwrap_or(false); // can't open DB → force rescan
213
214                        if schema_ok {
215                            // Submodule is fully up-to-date — skip the scan.
216                            if show {
217                                let short = if current_hash.len() >= 7 {
218                                    &current_hash[..7]
219                                } else {
220                                    current_hash
221                                };
222                                eprintln!("  \u{2713} Submodule {name} up-to-date ({short})");
223                            }
224
225                            actions.push(SubmoduleAction::Skip(ScannedSubmodule {
226                                mount_path: mount_path.clone(),
227                                name,
228                                db_path: stored.db_path.clone(),
229                                commit_hash,
230                            }));
231                            continue;
232                        }
233
234                        // Schema is stale — fall through to schedule a rescan.
235                        if show {
236                            eprintln!(
237                                "  \u{21bb} Submodule {name} IR schema outdated, re-scanning..."
238                            );
239                        }
240                    }
241                }
242            }
243
244            // Hash differs or submodule is new — schedule for parallel scan.
245            actions.push(SubmoduleAction::Scan {
246                mount_path: mount_path.clone(),
247                name,
248                submodule_abs,
249                commit_hash,
250            });
251        }
252
253        // Collect skipped submodules immediately, scan the rest in parallel.
254        let mut results: Vec<ScannedSubmodule> = Vec::new();
255        let mut to_scan: Vec<(String, String, std::path::PathBuf, Option<String>)> = Vec::new();
256
257        for action in actions {
258            match action {
259                SubmoduleAction::Skip(sub) => results.push(sub),
260                SubmoduleAction::Scan {
261                    mount_path,
262                    name,
263                    submodule_abs,
264                    commit_hash,
265                } => to_scan.push((mount_path, name, submodule_abs, commit_hash)),
266            }
267        }
268
269        if !to_scan.is_empty() {
270            // References shared across threads (read-only or thread-safe).
271            let scan_config = &config.scan;
272            let detection_config = &config.detection;
273            let project_name_ref = &project_name;
274
275            // Parallel scan via std::thread::scope — all threads join before scope exits.
276            let parallel_results: Vec<Result<ScannedSubmodule, CliError>> = std::thread::scope(
277                |scope| {
278                    let handles: Vec<_> = to_scan
279                        .iter()
280                        .map(|(mount_path, name, submodule_abs, commit_hash)| {
281                            let sp =
282                                make_manual_spinner(&format!("{name}: discovering files..."), show);
283
284                            scope.spawn(move || -> Result<ScannedSubmodule, CliError> {
285                                // Each thread opens its own DB connection.
286                                let sub_db_path = crate::db::resolve_submodule_db_path(
287                                    project_name_ref,
288                                    mount_path,
289                                )?;
290                                let sub_db = Database::open(&sub_db_path).map_err(|e| {
291                                    CliError::scan(format!(
292                                        "failed to open submodule database for '{mount_path}': {e}"
293                                    ))
294                                })?;
295
296                                // Mirror the root-scan auto-recovery: a stale IR
297                                // cache in a submodule DB would otherwise crash
298                                // the same way at `get_by_branch` time.
299                                let sub_wipe = wipe_stale_ir_cache(&sub_db).map_err(|e| {
300                                    CliError::scan(format!(
301                                        "failed to clear stale IR cache for submodule '{mount_path}': {e}"
302                                    ))
303                                })?;
304                                report_ir_cache_wipe(&sub_wipe, name, show);
305
306                                // Detect branch from the submodule's git repo.
307                                let sub_branch = crate::db::get_current_branch(submodule_abs)
308                                    .map(seshat_core::BranchId::from)
309                                    .unwrap_or_else(|| {
310                                        tracing::debug!(submodule = %submodule_abs.display(), "Could not detect branch for submodule scan, defaulting to 'main'");
311                                        seshat_core::BranchId::from("main")
312                                    });
313
314                                // Run the full scan pipeline, updating the spinner
315                                // with phase info so the user sees progress.
316                                let scan_result = scan_project_with_progress(
317                                    submodule_abs,
318                                    scan_config,
319                                    &sub_db,
320                                    |event| {
321                                        match event {
322                                            ScanProgress::Discovering { count } => {
323                                                sp.set_message(format!(
324                                                    "{name}: discovering files... {count} found"
325                                                ));
326                                            }
327                                            ScanProgress::DiscoveryDone { total } => {
328                                                sp.set_message(format!(
329                                                    "{name}: discovering files... {total} found"
330                                                ));
331                                            }
332                                            ScanProgress::CollectingGitHistory => {
333                                                sp.set_message(format!(
334                                                    "{name}: collecting git history..."
335                                                ));
336                                            }
337                                            ScanProgress::Scanning { done, total } => {
338                                                sp.set_message(format!(
339                                                    "{name}: scanning files... {done}/{total}"
340                                                ));
341                                            }
342                                            ScanProgress::BuildingModuleGraph => {
343                                                sp.set_message(format!(
344                                                    "{name}: building module graph..."
345                                                ));
346                                            }
347                                            ScanProgress::AnalyzingProjectFiles => {
348                                                sp.set_message(format!(
349                                                    "{name}: analyzing manifests & docs..."
350                                                ));
351                                            }
352                                            _ => {}
353                                        }
354                                        sp.tick();
355                                    },
356                                    sub_branch.clone(),
357                                )
358                                .map_err(|e| {
359                                    CliError::scan(format!(
360                                        "submodule scan failed for '{mount_path}': {e}"
361                                    ))
362                                })?;
363
364                                sp.set_message(format!("{name}: analyzing conventions..."));
365                                sp.tick();
366
367                                let report = detect_and_persist(
368                                    &sub_db,
369                                    &sub_branch,
370                                    &detection_config.clone(),
371                                    &scan_result,
372                                )?;
373
374                                // Write repo_metadata to submodule DB.
375                                let meta =
376                                    SqliteRepoMetadataRepository::new(sub_db.connection().clone());
377                                write_metadata(
378                                    &meta,
379                                    &[
380                                        ("parent_project", project_name_ref),
381                                        ("mount_path", mount_path),
382                                        ("file_count", &report.file_count.to_string()),
383                                        ("convention_count", &report.convention_count.to_string()),
384                                        ("last_scan_time", &unix_now().to_string()),
385                                    ],
386                                )?;
387
388                                // Sentinel write moved into the scanner
389                                // orchestrator (P19) so every scan path
390                                // records last_scanned_commit automatically
391                                // — no per-caller wiring required here.
392
393                                sp.finish_with_message(format!(
394                                    "{name}: done ({} files, {} conventions)",
395                                    report.file_count, report.convention_count,
396                                ));
397
398                                Ok(ScannedSubmodule {
399                                    mount_path: mount_path.clone(),
400                                    name: name.clone(),
401                                    db_path: sub_db_path.to_string_lossy().to_string(),
402                                    commit_hash: commit_hash.clone(),
403                                })
404                            })
405                        })
406                        .collect();
407
408                    // Collect results from all threads.
409                    handles
410                        .into_iter()
411                        .map(|h| h.join().expect("submodule scan thread panicked"))
412                        .collect()
413                },
414            );
415
416            // Propagate any errors from parallel scans.
417            for result in parallel_results {
418                results.push(result?);
419            }
420        }
421
422        results
423    } else {
424        Vec::new()
425    };
426
427    // -- Run root scan with progress ----------------------------------
428    // Root scan is sequential (all submodules are done), so plain spinners
429    // are fine — no MultiProgress needed.
430    let discovery_sp = make_spinner("Discovering files...", show);
431
432    let git_sp: std::cell::RefCell<Option<ProgressBar>> = std::cell::RefCell::new(None);
433    let scan_sp: std::cell::RefCell<Option<ProgressBar>> = std::cell::RefCell::new(None);
434    let graph_sp: std::cell::RefCell<Option<ProgressBar>> = std::cell::RefCell::new(None);
435    let project_sp: std::cell::RefCell<Option<ProgressBar>> = std::cell::RefCell::new(None);
436
437    let scan_result = scan_project_with_progress(
438        &root,
439        &config.scan,
440        &db,
441        |event| match event {
442            ScanProgress::Discovering { count } => {
443                discovery_sp.set_message(format!("Discovering files... {count} found"));
444            }
445            ScanProgress::DiscoveryDone { total } => {
446                discovery_sp.finish_with_message(format!("Discovering files... {total} found"));
447            }
448            ScanProgress::CollectingGitHistory => {
449                *git_sp.borrow_mut() = Some(make_spinner("Collecting git history...", show));
450            }
451            ScanProgress::GitHistoryDone => {
452                if let Some(ref sp) = *git_sp.borrow() {
453                    sp.finish_with_message("Collecting git history... done");
454                }
455            }
456            ScanProgress::Scanning { done, total } => {
457                let mut sp_opt = scan_sp.borrow_mut();
458                if sp_opt.is_none() {
459                    *sp_opt = Some(make_spinner(&format!("Scanning files... 0/{total}"), show));
460                }
461                if let Some(ref sp) = *sp_opt {
462                    sp.set_message(format!("Scanning files... {done}/{total}"));
463                }
464            }
465            ScanProgress::ScanningDone => {
466                if let Some(ref sp) = *scan_sp.borrow() {
467                    sp.finish_with_message(sp.message().to_string());
468                }
469            }
470            ScanProgress::BuildingModuleGraph => {
471                *graph_sp.borrow_mut() = Some(make_spinner("Building module graph...", show));
472            }
473            ScanProgress::ModuleGraphDone => {
474                if let Some(ref sp) = *graph_sp.borrow() {
475                    sp.finish_with_message("Building module graph... done");
476                }
477            }
478            ScanProgress::AnalyzingProjectFiles => {
479                *project_sp.borrow_mut() =
480                    Some(make_spinner("Analyzing manifests & docs...", show));
481            }
482            ScanProgress::ProjectFilesDone => {
483                if let Some(ref sp) = *project_sp.borrow() {
484                    sp.finish_with_message("Analyzing manifests & docs... done");
485                }
486            }
487
488            // Submodule progress events are not emitted by the root orchestrator
489            // (submodules are scanned in a separate phase above), but the enum
490            // is exhaustive so we need a catch-all.
491            _ => {}
492        },
493        scan_branch.clone(),
494    )
495    .map_err(CliError::scan)?;
496
497    // -- Run convention detection + persistence on root ----------------
498    let detection_config = config.detection.clone();
499
500    let detect_sp = make_spinner("Analyzing conventions...", show);
501    let all_files = {
502        use seshat_storage::{FileIRRepository, SqliteFileIRRepository};
503        SqliteFileIRRepository::new(db.connection().clone())
504            .get_by_branch(&scan_branch)
505            .map_err(|e| CliError::scan(format!("failed to load files for detection: {e}")))?
506    };
507
508    // scan_result.source_map now contains source for ALL files (unchanged and
509    // changed alike) — the orchestrator keeps source in memory for every file
510    // it reads, not just the ones it re-parses.  So we can pass it directly
511    // to run_all_detectors and every file will go through detect_with_source,
512    // producing real snippets in convention evidence.
513    let file_count = all_files.len();
514    detect_sp.set_message(format!("Analyzing conventions... 0/{file_count}"));
515    let progress_cb = |done: usize, _total: usize| {
516        detect_sp.set_message(format!("Analyzing conventions... {done}/{file_count}"));
517    };
518    let project_context = seshat_detectors::ProjectContext::from_files(&all_files);
519    let detector_results = run_all_detectors(
520        &all_files,
521        &scan_result.source_map,
522        &detection_config,
523        &project_context,
524        Some(&progress_cb),
525    );
526    detect_sp.finish_with_message(format!(
527        "Analyzing conventions... {file_count}/{file_count}"
528    ));
529
530    let all_findings: Vec<seshat_core::ConventionFinding> = detector_results
531        .into_iter()
532        .flat_map(|dr| dr.findings)
533        .collect();
534
535    let file_dates_map: std::collections::HashMap<String, Option<i64>> = all_files
536        .iter()
537        .map(|f| {
538            let date = scan_result.file_dates.get(f.path.as_path()).copied();
539            (f.path.to_string_lossy().to_string(), date)
540        })
541        .collect();
542
543    let aggregated = aggregate_findings(
544        &all_findings,
545        &detection_config,
546        &file_dates_map,
547        unix_now(),
548    );
549
550    seshat_graph::persist_and_index(db.connection(), &scan_branch, &aggregated, &all_findings)
551        .map_err(|e| CliError::scan(format!("persist conventions: {e}")))?;
552
553    // -- Generate embeddings (optional) --------------------------------
554    // Pass changed_paths (not the full source_map) so that only new/changed
555    // files get re-embedded.  Unchanged files already have current embeddings
556    // in the DB and don't need to consume embedding API quota.
557    if let Some(ref embedding_config) = config.embedding {
558        generate_embeddings(
559            &db,
560            embedding_config,
561            &all_files,
562            &scan_result.source_map,
563            &scan_result.changed_paths,
564            &scan_branch.0,
565            show,
566        )?;
567    }
568
569    // -- Update root DB with submodule info + repo_metadata -----------
570    let root_sub_repo = SqliteSubmoduleRepository::new(db.connection().clone());
571
572    for sub in &scanned_submodules {
573        root_sub_repo
574            .upsert(&SubmoduleInput {
575                relative_path: sub.mount_path.clone(),
576                name: sub.name.clone(),
577                db_path: sub.db_path.clone(),
578                commit_hash: sub.commit_hash.clone(),
579            })
580            .map_err(|e| {
581                CliError::scan(format!(
582                    "failed to upsert submodule '{}' in root DB: {e}",
583                    sub.mount_path
584                ))
585            })?;
586    }
587
588    // Remove submodules from the root DB that are no longer in .gitmodules.
589    if let Ok(stored_submodules) = root_sub_repo.list() {
590        let active_paths: std::collections::HashSet<&str> =
591            submodule_paths.iter().map(|s| s.as_str()).collect();
592        for stored in &stored_submodules {
593            if !active_paths.contains(stored.relative_path.as_str()) {
594                let _ = root_sub_repo.delete(&stored.relative_path);
595            }
596        }
597    }
598
599    // Write repo_metadata to root DB.
600    let root_meta = SqliteRepoMetadataRepository::new(db.connection().clone());
601    write_metadata(
602        &root_meta,
603        &[
604            ("project_name", &project_name),
605            ("project_root", path.to_string_lossy().as_ref()),
606            ("file_count", &file_count.to_string()),
607            ("convention_count", &aggregated.len().to_string()),
608            ("last_scan_time", &unix_now().to_string()),
609        ],
610    )?;
611
612    // Sentinel write moved into the scanner orchestrator (P19); every
613    // scan path records last_scanned_commit automatically.
614
615    let elapsed = start.elapsed();
616
617    // -- Build report data and print ----------------------------------
618    let report_data = crate::report::build_report_data(
619        &scan_result,
620        &all_files,
621        aggregated,
622        &db_path,
623        elapsed,
624        config.scan.exclude_submodules,
625    );
626    crate::report::print_report(&report_data, verbosity, color);
627
628    Ok(())
629}
630
631/// Shared spinner style for the standard braille animation.
632fn spinner_style() -> ProgressStyle {
633    ProgressStyle::with_template("  {spinner:.cyan} {msg}")
634        .expect("valid template")
635        .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏", "✓"])
636}
637
638/// Create a spinner with automatic steady tick (80ms).
639///
640/// Use for main-thread spinners (root scan phases) where a background
641/// tick thread is safe and keeps the animation smooth.
642/// If `visible` is `false`, the spinner draws to a hidden target (silent mode).
643fn make_spinner(msg: &str, visible: bool) -> ProgressBar {
644    let sp = ProgressBar::new_spinner();
645    if visible {
646        sp.set_style(spinner_style());
647        sp.set_message(msg.to_owned());
648        sp.enable_steady_tick(std::time::Duration::from_millis(80));
649    } else {
650        sp.set_draw_target(indicatif::ProgressDrawTarget::hidden());
651    }
652    sp
653}
654
655/// Create a spinner driven manually via `tick()` + `set_message()`.
656///
657/// Use for worker-thread spinners (submodule scans) where the caller
658/// drives updates from progress callbacks. No background tick thread —
659/// avoids cursor-position races between the tick thread and the worker.
660fn make_manual_spinner(msg: &str, visible: bool) -> ProgressBar {
661    let sp = ProgressBar::new_spinner();
662    if visible {
663        sp.set_style(spinner_style());
664        sp.set_message(msg.to_owned());
665        sp.tick(); // draw initial frame
666    } else {
667        sp.set_draw_target(indicatif::ProgressDrawTarget::hidden());
668    }
669    sp
670}
671
672/// Log a stale-IR-cache wipe (no-op when nothing was cleared).
673///
674/// Emits a structured `tracing::warn` for log consumers plus a single
675/// user-facing line on stderr when `visible` is true. `scope` is a short
676/// label distinguishing the root DB from a submodule (e.g. `"root"` or the
677/// submodule's display name) so users with multiple submodules can tell
678/// which DB needed recovery.
679fn report_ir_cache_wipe(report: &StaleIrWipeReport, scope: &str, visible: bool) {
680    if report.is_empty() {
681        return;
682    }
683
684    // Format cached versions as "[7]" or "[5, 7]" so the log line stays
685    // readable when a DB accumulated rows from several upgrades.
686    let versions = report
687        .cached_versions
688        .iter()
689        .map(u8::to_string)
690        .collect::<Vec<_>>()
691        .join(", ");
692    let current = seshat_storage::IR_SCHEMA_VERSION;
693
694    tracing::warn!(
695        scope = scope,
696        stale_count = report.stale_count,
697        cached_versions = versions,
698        current_version = current,
699        symbol_definitions_cleared = report.symbol_definitions_cleared,
700        symbol_imports_cleared = report.symbol_imports_cleared,
701        "IR cache schema mismatch — wiped stale rows, scan will re-parse from source",
702    );
703
704    if visible {
705        eprintln!(
706            "  \u{21bb} IR cache schema mismatch ({scope}): cached v[{versions}] != current v{current}, \
707             cleared {n} stale IR rows — re-parsing from scratch",
708            n = report.stale_count,
709        );
710    }
711}
712
713// ── Shared scan pipeline helpers ─────────────────────────────
714
/// Summary returned by [`detect_and_persist`].
///
/// Carries the counts copied from the detection cycle's report so callers can
/// record them as repository metadata.
#[derive(Debug)]
struct DetectionReport {
    /// Number of files reported by the detection cycle.
    file_count: usize,
    /// Number of conventions reported by the detection cycle.
    convention_count: usize,
}
721
722/// Run convention detection, aggregation, and persistence on an already-scanned DB.
723///
724/// Delegates to [`seshat_graph::run_detection_cycle`] — the single authoritative
725/// implementation shared with the warm-tier watcher.
726fn detect_and_persist(
727    db: &Database,
728    scan_branch: &BranchId,
729    detection_config: &DetectionConfig,
730    scan_result: &ScanResult,
731) -> Result<DetectionReport, CliError> {
732    // Build file-date map from the scan result so trend computation has git dates.
733    let file_dates_map: std::collections::HashMap<String, Option<i64>> = scan_result
734        .file_dates
735        .iter()
736        .map(|(p, &ts)| (p.to_string_lossy().to_string(), Some(ts)))
737        .collect();
738
739    let report = seshat_graph::run_detection_cycle(
740        db.connection(),
741        scan_branch,
742        detection_config,
743        &file_dates_map,
744        &scan_result.source_map,
745    )
746    .map_err(|e| CliError::scan(format!("detection pipeline failed: {e}")))?;
747
748    Ok(DetectionReport {
749        file_count: report.file_count,
750        convention_count: report.convention_count,
751    })
752}
753
754/// Write multiple key-value pairs to a [`SqliteRepoMetadataRepository`].
755fn write_metadata(
756    repo: &SqliteRepoMetadataRepository,
757    pairs: &[(&str, &str)],
758) -> Result<(), CliError> {
759    for (key, value) in pairs {
760        repo.set(key, value)
761            .map_err(|e| CliError::scan(format!("failed to write metadata '{key}': {e}")))?;
762    }
763    Ok(())
764}
765
766/// Generate embeddings for all code items (functions, types, exports) in the project.
767///
768/// When an embedding provider is configured, this function:
769/// 1. Creates the provider from config
770/// 2. Collects all (function, type, export) items from all parsed files
771/// 3. Batches texts and calls the provider
772/// 4. Stores embeddings in the `code_embeddings` table
773///
774/// On failure (e.g., provider timeout, connection error), logs a warning and
775/// continues — embedding is optional and should never break the scan pipeline.
776fn generate_embeddings(
777    db: &Database,
778    embedding_config: &seshat_embedding::EmbeddingConfig,
779    all_files: &[seshat_core::ProjectFile],
780    source_map: &std::collections::HashMap<std::path::PathBuf, String>,
781    changed_paths: &std::collections::HashSet<std::path::PathBuf>,
782    branch_id: &str,
783    show: bool,
784) -> Result<(), CliError> {
785    let provider = match seshat_embedding::create_provider(embedding_config) {
786        Ok(p) => p,
787        Err(e) => {
788            tracing::warn!("Failed to create embedding provider: {e}");
789            if show {
790                eprintln!("  \u{26a0} Embedding provider unavailable: {e}");
791            }
792            return Ok(());
793        }
794    };
795
796    // Collect items to embed: (file_path, item_name, item_kind, text_to_embed)
797    let mut items: Vec<(String, String, String, String)> = Vec::new();
798    for file in all_files {
799        // Skip files that haven't changed — their embeddings are already
800        // current in the DB from the previous scan.  Only new/changed files
801        // (tracked in changed_paths) need fresh embeddings.
802        if !changed_paths.contains(&file.path) {
803            continue;
804        }
805        // Source is always present in source_map for changed files.
806        let source = match source_map.get(&file.path) {
807            Some(s) => s,
808            None => continue,
809        };
810
811        let file_path = file.path.to_string_lossy().to_string();
812
813        // Use source already in memory — no disk read needed.
814        let source_lines: Option<Vec<String>> = Some(source.lines().map(str::to_owned).collect());
815
816        // Build import context string: module names imported in this file.
817        // Filter empty module names (e.g. side-effect imports like `import './foo'`).
818        // Cap at 20 modules to avoid consuming the model's token budget with boilerplate.
819        let import_context = {
820            let modules: Vec<&str> = file
821                .imports
822                .iter()
823                .map(|i| i.module.as_str())
824                .filter(|m| !m.is_empty())
825                .take(20)
826                .collect();
827            if modules.is_empty() {
828                String::new()
829            } else {
830                format!("\nuses: {}", modules.join(", "))
831            }
832        };
833
834        for func in &file.functions {
835            let vis = if func.is_public { "pub " } else { "" };
836            let asyncness = if func.is_async { "async " } else { "" };
837            let params = func.parameters.join(", ");
838            let body_snippet =
839                extract_body_snippet(source_lines.as_deref(), func.line, func.end_line);
840            let text = format!(
841                "{vis}{asyncness}fn {}({params}) in {file_path}{body_snippet}{import_context}",
842                func.name
843            );
844            items.push((
845                file_path.clone(),
846                func.name.clone(),
847                "function".to_string(),
848                text,
849            ));
850        }
851        for ty in &file.types {
852            let vis = if ty.is_public { "pub " } else { "" };
853            // Use explicit match instead of Debug format to get human-readable labels
854            // (e.g. "type_alias" not "TypeAlias", "class" not "Class").
855            let kind = match ty.kind {
856                seshat_core::TypeDefKind::Struct => "struct",
857                seshat_core::TypeDefKind::Enum => "enum",
858                seshat_core::TypeDefKind::Trait => "trait",
859                seshat_core::TypeDefKind::Interface => "interface",
860                seshat_core::TypeDefKind::Class => "class",
861                seshat_core::TypeDefKind::TypeAlias => "type_alias",
862            };
863            let text = format!("{vis}{kind} {} in {file_path}{import_context}", ty.name);
864            items.push((file_path.clone(), ty.name.clone(), "type".to_string(), text));
865        }
866        for exp in &file.exports {
867            let default = if exp.is_default { "default " } else { "" };
868            let text = format!(
869                "export {default}{} in {file_path}{import_context}",
870                exp.name
871            );
872            items.push((
873                file_path.clone(),
874                exp.name.clone(),
875                "export".to_string(),
876                text,
877            ));
878        }
879    }
880
881    if items.is_empty() {
882        tracing::info!("No code items to embed");
883        return Ok(());
884    }
885
886    let total = items.len();
887    let batch_size = embedding_config.batch_size.max(1);
888    let embed_sp = make_spinner(&format!("Generating embeddings... 0/{total}"), show);
889
890    let conn = db.connection().clone();
891    let embedding_repo = SqliteEmbeddingRepository::new(conn);
892
893    // Build the set of all (file_path, item_name, item_kind) that SHOULD
894    // exist in the DB after this scan succeeds. This lets us diff against
895    // stored rows and prune embeddings from deleted/renamed files.
896    let mut current_keys: std::collections::HashSet<(String, String, String)> =
897        std::collections::HashSet::new();
898    for file in all_files {
899        let file_path = file.path.to_string_lossy().to_string();
900        for func in &file.functions {
901            current_keys.insert((file_path.clone(), func.name.clone(), "function".to_string()));
902        }
903        for ty in &file.types {
904            current_keys.insert((file_path.clone(), ty.name.clone(), "type".to_string()));
905        }
906        for exp in &file.exports {
907            current_keys.insert((file_path.clone(), exp.name.clone(), "export".to_string()));
908        }
909    }
910
911    // NOTE: We intentionally do NOT delete_by_branch here. If embedding
912    // generation fails mid-way (provider timeout, rate limit), we'd lose
913    // the previously complete embedding set with nothing to replace it.
914    // Instead we rely on upsert (ON CONFLICT DO UPDATE) and prune stale
915    // rows after a successful upsert by diffing current_keys against
916    // stored_keys — stale rows from deleted/renamed files are cleaned
917    // up without risking data loss.
918
919    let mut embedded_count: usize = 0;
920
921    let _embedding_outcome: Result<(), ()> = 'embed: {
922        for chunk in items.chunks(batch_size) {
923            let texts: Vec<String> = chunk.iter().map(|(_, _, _, text)| text.clone()).collect();
924
925            match provider.embed(&texts) {
926                Ok(embeddings) => {
927                    let inputs: Vec<EmbeddingInput> = chunk
928                        .iter()
929                        .zip(embeddings)
930                        .map(
931                            |((file_path, item_name, item_kind, _), emb)| EmbeddingInput {
932                                file_path: file_path.clone(),
933                                item_name: item_name.clone(),
934                                item_kind: item_kind.clone(),
935                                embedding: emb,
936                            },
937                        )
938                        .collect();
939
940                    if let Err(e) = embedding_repo.upsert_batch(branch_id, &inputs) {
941                        tracing::warn!("Failed to store embedding batch: {e}");
942                        embed_sp.finish_with_message(
943                            "Generating embeddings... failed (storage error)".to_string(),
944                        );
945                        break 'embed Err(());
946                    }
947
948                    embedded_count += chunk.len();
949                    embed_sp
950                        .set_message(format!("Generating embeddings... {embedded_count}/{total}"));
951                }
952                Err(e) => {
953                    tracing::warn!(
954                        embedded = embedded_count,
955                        total = total,
956                        remaining = total - embedded_count,
957                        "Embedding provider error mid-batch; {embedded_count}/{total} items stored, \
958                         {} items skipped. Database contains partial embeddings: {e}",
959                        total - embedded_count,
960                    );
961                    embed_sp.finish_with_message(format!(
962                        "Generating embeddings... failed ({embedded_count}/{total})"
963                    ));
964                    if show {
965                        eprintln!(
966                            "  \u{26a0} Embedding generation failed after {embedded_count}/{total} items \
967                             ({} skipped, partial state): {e}",
968                            total - embedded_count,
969                        );
970                    }
971                    break 'embed Err(());
972                }
973            }
974        }
975
976        embed_sp.finish_with_message(format!("Generating embeddings... {embedded_count}/{total}"));
977
978        tracing::info!(
979            count = embedded_count,
980            total = total,
981            "Generated code embeddings"
982        );
983
984        Ok(())
985    };
986
987    // Prune stale embedding rows from deleted/renamed files.
988    match embedding_repo.get_stored_keys(branch_id) {
989        Ok(stored_keys) => {
990            let stored_set: std::collections::HashSet<_> = stored_keys.into_iter().collect();
991            let stale: Vec<_> = stored_set.difference(&current_keys).cloned().collect();
992
993            if !stale.is_empty() {
994                match embedding_repo.delete_stale(branch_id, &stale) {
995                    Ok(pruned) => {
996                        tracing::info!(pruned = pruned, "Pruned {} stale embedding rows", pruned);
997                    }
998                    Err(e) => {
999                        tracing::warn!(
1000                            "Failed to prune stale embedding rows: {e} (will retry next scan)"
1001                        );
1002                    }
1003                }
1004            }
1005        }
1006        Err(e) => {
1007            tracing::warn!(
1008                "Failed to query stored embedding keys for stale cleanup: {e} (will retry next scan)"
1009            );
1010        }
1011    }
1012
1013    Ok(())
1014}
1015
/// Builds the body excerpt that is appended to a function's embedding text.
///
/// `start_line` and `end_line` are 1-indexed and inclusive; `end_line` is
/// clamped to the number of available lines. Bodies of at most
/// `HEAD_LINES + TAIL_LINES` lines are returned in full. Longer bodies are
/// reduced to their first `HEAD_LINES` lines, a `...` marker line, and their
/// last `TAIL_LINES` lines.
///
/// The excerpt is trimmed of surrounding whitespace and prefixed with a
/// newline. An empty string is returned when no source lines are available,
/// `start_line` is 0, or the range selects no lines.
fn extract_body_snippet(
    source_lines: Option<&[String]>,
    start_line: usize,
    end_line: usize,
) -> String {
    const HEAD_LINES: usize = 5;
    const TAIL_LINES: usize = 3;

    let Some(all_lines) = source_lines else {
        return String::new();
    };
    if all_lines.is_empty() || start_line == 0 {
        return String::new();
    }

    // Translate the 1-indexed inclusive range into a clamped half-open slice range.
    let line_count = all_lines.len();
    let first = (start_line - 1).min(line_count);
    let last = end_line.min(line_count);
    if first >= last {
        return String::new();
    }
    let body = &all_lines[first..last];

    let snippet = if body.len() > HEAD_LINES + TAIL_LINES {
        // Lines would be skipped between head and tail: mark the gap with "...".
        let (head, _) = body.split_at(HEAD_LINES);
        let tail = &body[body.len() - TAIL_LINES..];

        let mut parts: Vec<&str> = Vec::with_capacity(HEAD_LINES + TAIL_LINES + 1);
        parts.extend(head.iter().map(String::as_str));
        parts.push("...");
        parts.extend(tail.iter().map(String::as_str));
        parts.join("\n")
    } else {
        body.join("\n")
    };

    format!("\n{}", snippet.trim())
}
1070
1071#[cfg(test)]
1072mod tests {
1073    use super::*;
1074    use seshat_scanner::scan_project;
1075    use seshat_storage::{
1076        Database, FileIRRepository, RepoMetadataRepository, SqliteFileIRRepository,
1077        SqliteRepoMetadataRepository, SqliteSubmoduleRepository, SubmoduleInput,
1078        SubmoduleRepository,
1079    };
1080    use std::fs;
1081    use tempfile::tempdir;
1082
1083    /// Helper: create a root project with a mock submodule directory.
1084    ///
1085    /// Layout:
1086    /// ```text
1087    /// root/
1088    ///   .git/
1089    ///   .gitmodules          (declares "frontend" submodule)
1090    ///   src/main.rs
1091    ///   frontend/
1092    ///     .git/              (marks it as an initialized submodule)
1093    ///     src/app.ts
1094    /// ```
1095    fn create_project_with_submodule() -> tempfile::TempDir {
1096        let dir = tempdir().expect("create tempdir");
1097        let root = dir.path();
1098
1099        // Root project
1100        fs::create_dir_all(root.join(".git")).unwrap();
1101        fs::create_dir_all(root.join("src")).unwrap();
1102        fs::write(
1103            root.join("src/main.rs"),
1104            "pub fn main() { println!(\"hello\"); }\n",
1105        )
1106        .unwrap();
1107
1108        // .gitmodules declaring the submodule
1109        fs::write(
1110            root.join(".gitmodules"),
1111            "[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/fe.git\n",
1112        )
1113        .unwrap();
1114
1115        // Submodule directory (initialized with .git)
1116        fs::create_dir_all(root.join("frontend/.git")).unwrap();
1117        fs::create_dir_all(root.join("frontend/src")).unwrap();
1118        fs::write(
1119            root.join("frontend/src/app.ts"),
1120            "export function app(): string { return 'hello'; }\n",
1121        )
1122        .unwrap();
1123
1124        dir
1125    }
1126
1127    #[test]
1128    fn submodule_scan_creates_separate_dbs_with_correct_structure() {
1129        let dir = create_project_with_submodule();
1130        let root = dir.path();
1131        let config = seshat_core::ScanConfig::default();
1132
1133        // Create root DB and submodule DB (both in-memory for testing).
1134        let root_db = Database::open(":memory:").expect("open root DB");
1135        let sub_db = Database::open(":memory:").expect("open submodule DB");
1136
1137        // Scan root project (submodule dirs are excluded from root discovery).
1138        let root_result = scan_project(root, &config, &root_db, BranchId::from("main"))
1139            .expect("root scan should succeed");
1140        assert!(
1141            !root_result.excluded_submodules.is_empty(),
1142            "should detect submodule in .gitmodules"
1143        );
1144        assert_eq!(root_result.excluded_submodules, vec!["frontend"]);
1145
1146        // Root should only find main.rs (frontend is excluded).
1147        assert_eq!(
1148            root_result.files_discovered, 1,
1149            "root should discover 1 file (main.rs)"
1150        );
1151
1152        // Scan submodule directory into its own DB.
1153        let sub_root = root.join("frontend");
1154        let sub_result = scan_project(&sub_root, &config, &sub_db, BranchId::from("main"))
1155            .expect("submodule scan should succeed");
1156        assert_eq!(
1157            sub_result.files_discovered, 1,
1158            "submodule should discover 1 file (app.ts)"
1159        );
1160
1161        // Verify both DBs have IR records.
1162        use seshat_storage::{FileIRRepository, SqliteFileIRRepository};
1163        let branch = BranchId::from("main");
1164
1165        let root_files = SqliteFileIRRepository::new(root_db.connection().clone())
1166            .get_by_branch(&branch)
1167            .unwrap();
1168        assert_eq!(root_files.len(), 1, "root DB should have 1 file IR");
1169
1170        let sub_files = SqliteFileIRRepository::new(sub_db.connection().clone())
1171            .get_by_branch(&branch)
1172            .unwrap();
1173        assert_eq!(sub_files.len(), 1, "submodule DB should have 1 file IR");
1174
1175        // Write repo_metadata to submodule DB (as run_scan does).
1176        let sub_meta = SqliteRepoMetadataRepository::new(sub_db.connection().clone());
1177        sub_meta.set("parent_project", "my-project").unwrap();
1178        sub_meta.set("mount_path", "frontend").unwrap();
1179        sub_meta
1180            .set("file_count", &sub_result.files_discovered.to_string())
1181            .unwrap();
1182        sub_meta.set("convention_count", "0").unwrap();
1183        sub_meta.set("last_scan_time", "1700000000").unwrap();
1184
1185        assert_eq!(
1186            sub_meta.get("parent_project").unwrap().unwrap(),
1187            "my-project"
1188        );
1189        assert_eq!(sub_meta.get("mount_path").unwrap().unwrap(), "frontend");
1190        assert_eq!(sub_meta.get("file_count").unwrap().unwrap(), "1");
1191
1192        // Write submodule record to root DB (as run_scan does).
1193        let root_sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1194        root_sub_repo
1195            .insert(&SubmoduleInput {
1196                relative_path: "frontend".to_string(),
1197                name: "frontend".to_string(),
1198                db_path: "/data/seshat/repos/my-project/frontend.db".to_string(),
1199                commit_hash: None, // mock submodule has no real commits
1200            })
1201            .unwrap();
1202
1203        let stored = root_sub_repo.list().unwrap();
1204        assert_eq!(stored.len(), 1);
1205        assert_eq!(stored[0].relative_path, "frontend");
1206        assert_eq!(stored[0].name, "frontend");
1207
1208        // Write repo_metadata to root DB.
1209        let root_meta = SqliteRepoMetadataRepository::new(root_db.connection().clone());
1210        root_meta.set("project_name", "my-project").unwrap();
1211        root_meta
1212            .set("file_count", &root_result.files_discovered.to_string())
1213            .unwrap();
1214        root_meta.set("convention_count", "0").unwrap();
1215        root_meta.set("last_scan_time", "1700000000").unwrap();
1216
1217        assert_eq!(
1218            root_meta.get("project_name").unwrap().unwrap(),
1219            "my-project"
1220        );
1221        assert_eq!(root_meta.get("file_count").unwrap().unwrap(), "1");
1222    }
1223
1224    #[test]
1225    fn uninitialised_submodule_is_skipped() {
1226        let dir = tempdir().expect("create tempdir");
1227        let root = dir.path();
1228
1229        fs::create_dir_all(root.join(".git")).unwrap();
1230        fs::create_dir_all(root.join("src")).unwrap();
1231        fs::write(root.join("src/main.rs"), "pub fn main() {}\n").unwrap();
1232
1233        // .gitmodules declares a submodule that exists as a directory but has no .git
1234        fs::write(
1235            root.join(".gitmodules"),
1236            "[submodule \"libs/shared\"]\n\tpath = libs/shared\n\turl = https://example.com\n",
1237        )
1238        .unwrap();
1239        fs::create_dir_all(root.join("libs/shared")).unwrap();
1240        // No .git in libs/shared — it's not initialized
1241
1242        let config = seshat_core::ScanConfig::default();
1243        let db = Database::open(":memory:").expect("open DB");
1244
1245        let result =
1246            scan_project(root, &config, &db, BranchId::from("main")).expect("scan should succeed");
1247
1248        // Submodule dirs are always excluded from root discovery.
1249        assert_eq!(result.excluded_submodules, vec!["libs/shared"]);
1250        // Root only finds main.rs.
1251        assert_eq!(result.files_discovered, 1);
1252    }
1253
1254    #[test]
1255    fn submodule_removed_from_gitmodules_gets_deleted_from_table() {
1256        let root_db = Database::open(":memory:").expect("open DB");
1257        let sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1258
1259        // Simulate a previously scanned submodule in the table.
1260        sub_repo
1261            .insert(&SubmoduleInput {
1262                relative_path: "old-module".to_string(),
1263                name: "old-module".to_string(),
1264                db_path: "/data/repos/project/old-module.db".to_string(),
1265                commit_hash: Some("abc123".to_string()),
1266            })
1267            .unwrap();
1268
1269        // Current .gitmodules no longer includes "old-module".
1270        let active_paths: std::collections::HashSet<&str> = ["frontend"].iter().copied().collect();
1271
1272        let stored = sub_repo.list().unwrap();
1273        for stored_sub in &stored {
1274            if !active_paths.contains(stored_sub.relative_path.as_str()) {
1275                let _ = sub_repo.delete(&stored_sub.relative_path);
1276            }
1277        }
1278
1279        let remaining = sub_repo.list().unwrap();
1280        assert!(
1281            remaining.is_empty(),
1282            "old-module should have been removed from submodules table"
1283        );
1284    }
1285
1286    // -- US-005: Change detection unit tests --------------------------
1287
1288    /// Helper: determine if a submodule should be skipped based on stored vs current hash.
1289    /// Returns true if the scan should be skipped (hashes match).
1290    fn should_skip_submodule(stored_hash: Option<&str>, current_hash: Option<&str>) -> bool {
1291        match (current_hash, stored_hash) {
1292            (Some(current), Some(stored)) => current == stored,
1293            _ => false,
1294        }
1295    }
1296
1297    #[test]
1298    fn change_detection_skip_when_hashes_match() {
1299        // Both hashes are Some and equal → skip.
1300        assert!(should_skip_submodule(
1301            Some("abc123def456abc123def456abc123def456abc123"),
1302            Some("abc123def456abc123def456abc123def456abc123"),
1303        ));
1304    }
1305
1306    #[test]
1307    fn change_detection_rescan_when_hashes_differ() {
1308        // Both hashes are Some but different → rescan.
1309        assert!(!should_skip_submodule(
1310            Some("abc123def456abc123def456abc123def456abc123"),
1311            Some("000000def456abc123def456abc123def456abc123"),
1312        ));
1313    }
1314
1315    #[test]
1316    fn change_detection_rescan_when_no_stored_hash() {
1317        // Stored hash is None (first scan or no commits at previous scan) → rescan.
1318        assert!(!should_skip_submodule(
1319            None,
1320            Some("abc123def456abc123def456abc123def456abc123"),
1321        ));
1322    }
1323
1324    #[test]
1325    fn change_detection_rescan_when_no_current_hash() {
1326        // Current hash is None (submodule has no commits now) → rescan.
1327        assert!(!should_skip_submodule(
1328            Some("abc123def456abc123def456abc123def456abc123"),
1329            None,
1330        ));
1331    }
1332
1333    #[test]
1334    fn change_detection_rescan_when_both_hashes_none() {
1335        // Both hashes are None → rescan (can't confirm up-to-date).
1336        assert!(!should_skip_submodule(None, None));
1337    }
1338
1339    #[test]
1340    fn change_detection_new_submodule_triggers_full_scan() {
1341        // New submodule: not in the stored table at all → no stored record.
1342        let root_db = Database::open(":memory:").expect("open DB");
1343        let sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1344
1345        // Submodule "frontend" not in the table yet.
1346        let stored = sub_repo.find_by_path("frontend").unwrap();
1347        assert!(stored.is_none(), "new submodule should not be in table");
1348
1349        // Since there's no stored record, the change detection logic
1350        // will fall through to full scan (no match possible).
1351    }
1352
1353    #[test]
1354    fn change_detection_updated_hash_stored_after_rescan() {
1355        let root_db = Database::open(":memory:").expect("open DB");
1356        let sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1357
1358        // Insert a submodule with an old hash.
1359        let old_hash = "aaaa".repeat(10);
1360        sub_repo
1361            .insert(&SubmoduleInput {
1362                relative_path: "frontend".to_string(),
1363                name: "frontend".to_string(),
1364                db_path: "/data/repos/project/frontend.db".to_string(),
1365                commit_hash: Some(old_hash.clone()),
1366            })
1367            .unwrap();
1368
1369        // Simulate: current hash differs → rescan happened → update stored hash.
1370        let new_hash = "bbbb".repeat(10);
1371        sub_repo
1372            .update(&SubmoduleInput {
1373                relative_path: "frontend".to_string(),
1374                name: "frontend".to_string(),
1375                db_path: "/data/repos/project/frontend.db".to_string(),
1376                commit_hash: Some(new_hash.clone()),
1377            })
1378            .unwrap();
1379
1380        let stored = sub_repo.find_by_path("frontend").unwrap().unwrap();
1381        assert_eq!(
1382            stored.commit_hash.as_deref(),
1383            Some(new_hash.as_str()),
1384            "stored hash should be updated after rescan"
1385        );
1386
1387        // On the next scan, the hashes will match → skip.
1388        assert!(should_skip_submodule(
1389            stored.commit_hash.as_deref(),
1390            Some(&new_hash),
1391        ));
1392    }
1393
1394    #[test]
1395    fn change_detection_skipped_submodule_not_deleted_from_table() {
1396        let root_db = Database::open(":memory:").expect("open DB");
1397        let sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1398
1399        let hash = "abcd".repeat(10);
1400        sub_repo
1401            .insert(&SubmoduleInput {
1402                relative_path: "frontend".to_string(),
1403                name: "frontend".to_string(),
1404                db_path: "/data/repos/project/frontend.db".to_string(),
1405                commit_hash: Some(hash.clone()),
1406            })
1407            .unwrap();
1408
1409        // Simulate: submodule was skipped (up-to-date) but still tracked in
1410        // the scanned_submodules list, so cleanup won't delete it.
1411        let active_paths: std::collections::HashSet<&str> = ["frontend"].iter().copied().collect();
1412
1413        let stored = sub_repo.list().unwrap();
1414        for stored_sub in &stored {
1415            if !active_paths.contains(stored_sub.relative_path.as_str()) {
1416                let _ = sub_repo.delete(&stored_sub.relative_path);
1417            }
1418        }
1419
1420        let remaining = sub_repo.list().unwrap();
1421        assert_eq!(
1422            remaining.len(),
1423            1,
1424            "skipped submodule should remain in table"
1425        );
1426        assert_eq!(remaining[0].relative_path, "frontend");
1427    }
1428
1429    // ── extract_body_snippet tests ────────────────────────────────────────────
1430
1431    fn make_lines(n: usize) -> Vec<String> {
1432        (1..=n).map(|i| format!("line_{i}")).collect()
1433    }
1434
1435    #[test]
1436    fn body_snippet_none_source_returns_empty() {
1437        assert_eq!(extract_body_snippet(None, 1, 5), "");
1438    }
1439
1440    #[test]
1441    fn body_snippet_start_zero_returns_empty() {
1442        let lines = make_lines(10);
1443        // start_line=0 is invalid (IR lines are 1-indexed)
1444        assert_eq!(extract_body_snippet(Some(&lines), 0, 5), "");
1445    }
1446
1447    #[test]
1448    fn body_snippet_single_line_function() {
1449        let lines = make_lines(20);
1450        // Function at line 5, single line
1451        let result = extract_body_snippet(Some(&lines), 5, 5);
1452        assert!(!result.is_empty());
1453        assert!(result.contains("line_5"));
1454    }
1455
1456    #[test]
1457    fn body_snippet_short_function_returns_all_lines() {
1458        let lines = make_lines(20);
1459        // Function lines 3-7 (5 lines) — fits in HEAD (5) without truncation
1460        let result = extract_body_snippet(Some(&lines), 3, 7);
1461        assert!(result.contains("line_3"));
1462        assert!(result.contains("line_7"));
1463        assert!(!result.contains("...")); // no truncation marker
1464    }
1465
1466    #[test]
1467    fn body_snippet_long_function_has_head_and_tail() {
1468        let lines = make_lines(50);
1469        // Function lines 1-50 — should produce head...tail
1470        let result = extract_body_snippet(Some(&lines), 1, 50);
1471        assert!(result.contains("line_1")); // head
1472        assert!(result.contains("line_5")); // head last
1473        assert!(result.contains("...")); // truncation marker
1474        assert!(result.contains("line_50")); // tail last
1475        assert!(result.contains("line_48")); // tail first
1476        // middle lines should NOT appear
1477        assert!(!result.contains("line_25"));
1478    }
1479
1480    #[test]
1481    fn body_snippet_exactly_boundary_no_overlap() {
1482        let lines = make_lines(20);
1483        // HEAD_LINES=5 + TAIL_LINES=3 = 8. Function with exactly 8 lines
1484        // should NOT produce ... (fits entirely)
1485        let result = extract_body_snippet(Some(&lines), 1, 8);
1486        assert!(
1487            !result.contains("..."),
1488            "8-line function should not be truncated"
1489        );
1490        assert!(result.contains("line_1"));
1491        assert!(result.contains("line_8")); // all 8 lines present
1492    }
1493
1494    #[test]
1495    fn body_snippet_trim_applied() {
1496        let lines = vec![
1497            "  fn foo() {".to_owned(),
1498            "    let x = 1;".to_owned(),
1499            "  }".to_owned(),
1500        ];
1501        let result = extract_body_snippet(Some(&lines), 1, 3);
1502        // Should start with \n then trimmed content
1503        assert!(result.starts_with('\n'));
1504        assert!(!result.starts_with("\n  ")); // leading whitespace trimmed
1505    }
1506
1507    #[test]
1508    fn body_snippet_empty_lines_returns_empty() {
1509        let lines: Vec<String> = Vec::new();
1510        assert_eq!(extract_body_snippet(Some(&lines), 1, 5), "");
1511    }
1512
1513    #[test]
1514    fn body_snippet_start_after_end_returns_empty() {
1515        // start_line > end_line is invalid — early return.
1516        let lines = make_lines(20);
1517        assert_eq!(extract_body_snippet(Some(&lines), 10, 5), "");
1518    }
1519
1520    #[test]
1521    fn body_snippet_end_line_clamped_to_available() {
1522        // end_line beyond available lines must clamp, not panic.
1523        let lines = make_lines(5);
1524        let result = extract_body_snippet(Some(&lines), 1, 999);
1525        assert!(result.contains("line_1"));
1526        assert!(result.contains("line_5"));
1527    }
1528
1529    #[test]
1530    fn body_snippet_start_at_last_line_returns_single_line() {
1531        let lines = make_lines(5);
1532        // start_line=5 → start=4, end=5.min(5)=5 → body = lines[4..5]
1533        let result = extract_body_snippet(Some(&lines), 5, 5);
1534        assert!(result.contains("line_5"));
1535        assert!(!result.contains("line_4"));
1536    }
1537
1538    #[test]
1539    fn body_snippet_start_past_lines_returns_empty() {
1540        // start_line - 1 == lines.len() (clamp), so start == end → empty.
1541        let lines = make_lines(3);
1542        assert_eq!(extract_body_snippet(Some(&lines), 4, 4), "");
1543    }
1544
1545    #[test]
1546    fn body_snippet_long_body_skips_middle_lines() {
1547        // Body of 15 lines: HEAD=5, TAIL=3 → 7 middle lines must be omitted.
1548        let lines = make_lines(20);
1549        let result = extract_body_snippet(Some(&lines), 1, 15);
1550        assert!(result.contains("line_1"));
1551        assert!(result.contains("line_5")); // HEAD ends
1552        assert!(!result.contains("line_6")); // first omitted
1553        assert!(!result.contains("line_10")); // middle omitted
1554        assert!(result.contains("line_13")); // TAIL begins
1555        assert!(result.contains("line_15")); // TAIL ends
1556        assert!(result.contains("..."));
1557    }
1558
1559    // ── Branch-aware detect_and_persist tests ──────────────────────────────────
1560
1561    #[test]
1562    fn detect_and_persist_uses_branch_id_for_loading_files() {
1563        let db = Database::open(":memory:").expect("open DB");
1564        let feature_branch = BranchId::from("feat/my-feature");
1565
1566        use seshat_core::test_helpers::make_project_file;
1567        use seshat_storage::{FileIRRepository, SqliteFileIRRepository};
1568
1569        let file = make_project_file(seshat_core::Language::Rust);
1570        SqliteFileIRRepository::new(db.connection().clone())
1571            .upsert(&feature_branch, &file, None)
1572            .expect("upsert file under feature branch");
1573
1574        let scan_result = seshat_scanner::ScanResult {
1575            files_discovered: 1,
1576            files_parsed: 1,
1577            nodes_persisted: 0,
1578            edges_persisted: 0,
1579            manifests_analyzed: 0,
1580            docs_ingested: 0,
1581            manifest_analyses: vec![],
1582            incremental: None,
1583            file_dates: std::collections::HashMap::new(),
1584            excluded_submodules: vec![],
1585            source_map: std::collections::HashMap::new(),
1586            changed_paths: std::collections::HashSet::new(),
1587        };
1588
1589        let config = DetectionConfig::default();
1590        let result = detect_and_persist(&db, &feature_branch, &config, &scan_result);
1591        assert!(
1592            result.is_ok(),
1593            "detect_and_persist should succeed: {result:?}"
1594        );
1595        let report = result.unwrap();
1596        assert_eq!(
1597            report.file_count, 1,
1598            "should find the file stored under feature branch"
1599        );
1600    }
1601
1602    #[test]
1603    fn detect_and_persist_returns_zero_for_wrong_branch() {
1604        let db = Database::open(":memory:").expect("open DB");
1605        let feature_branch = BranchId::from("feat/my-feature");
1606        let main_branch = BranchId::from("main");
1607
1608        use seshat_core::test_helpers::make_project_file;
1609        use seshat_storage::{FileIRRepository, SqliteFileIRRepository};
1610
1611        let file = make_project_file(seshat_core::Language::Rust);
1612        SqliteFileIRRepository::new(db.connection().clone())
1613            .upsert(&feature_branch, &file, None)
1614            .expect("upsert file under feature branch");
1615
1616        let scan_result = seshat_scanner::ScanResult {
1617            files_discovered: 1,
1618            files_parsed: 1,
1619            nodes_persisted: 0,
1620            edges_persisted: 0,
1621            manifests_analyzed: 0,
1622            docs_ingested: 0,
1623            manifest_analyses: vec![],
1624            incremental: None,
1625            file_dates: std::collections::HashMap::new(),
1626            excluded_submodules: vec![],
1627            source_map: std::collections::HashMap::new(),
1628            changed_paths: std::collections::HashSet::new(),
1629        };
1630
1631        let config = DetectionConfig::default();
1632        let result = detect_and_persist(&db, &main_branch, &config, &scan_result);
1633        assert!(result.is_ok());
1634        let report = result.unwrap();
1635        assert_eq!(report.file_count, 0, "main branch should have no files");
1636    }
1637
1638    #[test]
1639    fn detect_and_persist_persists_conventions_under_correct_branch() {
1640        let db = Database::open(":memory:").expect("open DB");
1641        let feature_branch = BranchId::from("feat/snippets");
1642
1643        use seshat_core::test_helpers::make_project_file;
1644        use seshat_storage::{
1645            FileIRRepository, NodeRepository, SqliteFileIRRepository, SqliteNodeRepository,
1646        };
1647
1648        let file = make_project_file(seshat_core::Language::Rust);
1649        SqliteFileIRRepository::new(db.connection().clone())
1650            .upsert(&feature_branch, &file, None)
1651            .expect("upsert file under feature branch");
1652
1653        let scan_result = seshat_scanner::ScanResult {
1654            files_discovered: 1,
1655            files_parsed: 1,
1656            nodes_persisted: 0,
1657            edges_persisted: 0,
1658            manifests_analyzed: 0,
1659            docs_ingested: 0,
1660            manifest_analyses: vec![],
1661            incremental: None,
1662            file_dates: std::collections::HashMap::new(),
1663            excluded_submodules: vec![],
1664            source_map: std::collections::HashMap::new(),
1665            changed_paths: std::collections::HashSet::new(),
1666        };
1667
1668        let config = DetectionConfig::default();
1669        let result = detect_and_persist(&db, &feature_branch, &config, &scan_result);
1670        assert!(result.is_ok());
1671
1672        let node_repo = SqliteNodeRepository::new(db.connection().clone());
1673        let nodes = node_repo
1674            .find_by_branch(&feature_branch)
1675            .expect("find nodes");
1676        assert!(
1677            !nodes.is_empty(),
1678            "conventions should be persisted under feature branch"
1679        );
1680
1681        let main_nodes = node_repo
1682            .find_by_branch(&BranchId::from("main"))
1683            .expect("find nodes");
1684        assert!(
1685            main_nodes.is_empty(),
1686            "no conventions should be under main branch"
1687        );
1688    }
1689
1690    #[test]
1691    fn scan_project_with_source_map_produces_snippets() {
1692        let dir = tempdir().expect("create tempdir");
1693        let root = dir.path();
1694
1695        fs::create_dir_all(root.join(".git")).unwrap();
1696        fs::create_dir_all(root.join("src")).unwrap();
1697        fs::write(
1698            root.join("src/main.rs"),
1699            "use std::error::Error;\n\npub fn main() {}\n",
1700        )
1701        .unwrap();
1702
1703        let config = seshat_core::ScanConfig::default();
1704        let db = Database::open(":memory:").expect("open DB");
1705        let branch = BranchId::from("test-branch");
1706
1707        let result = scan_project(root, &config, &db, branch.clone()).expect("scan should succeed");
1708        assert!(
1709            !result.source_map.is_empty(),
1710            "source_map should contain files"
1711        );
1712
1713        let file_ir_repo = SqliteFileIRRepository::new(db.connection().clone());
1714        let files = file_ir_repo.get_by_branch(&branch).expect("get files");
1715        assert!(
1716            !files.is_empty(),
1717            "files should be stored under the scan branch"
1718        );
1719
1720        let main_files = file_ir_repo
1721            .get_by_branch(&BranchId::from("main"))
1722            .expect("get files");
1723        assert!(
1724            main_files.is_empty() || main_files.len() != files.len(),
1725            "files should NOT be stored under main branch when scanning a different branch"
1726        );
1727    }
1728}