Skip to main content

seshat_cli/
scan.rs

1//! Implementation of the `seshat scan <path>` command.
2//!
3//! Runs the full scan pipeline: discovery -> parse -> detect -> aggregate -> store,
4//! with uniform spinner-based progress display for all phases.
5
6use std::path::Path;
7use std::time::Instant;
8
9use indicatif::{ProgressBar, ProgressStyle};
10use seshat_core::{BranchId, DetectionConfig};
11use seshat_detectors::{aggregate_findings, run_all_detectors};
12use seshat_scanner::{
13    ScanProgress, ScanResult, detect_submodule_paths, scan_project_with_progress,
14};
15use seshat_storage::{
16    Database, EmbeddingInput, EmbeddingRepository, RepoMetadataRepository,
17    SqliteEmbeddingRepository, SqliteRepoMetadataRepository, SqliteSubmoduleRepository,
18    SubmoduleInput, SubmoduleRepository,
19};
20
21use crate::config::AppConfig;
22use crate::db::unix_now;
23use crate::error::CliError;
24use crate::format::{self, Verbosity};
25
26/// Run the scan command on the given project directory.
27///
28/// # Pipeline
29///
30/// 1. Validate path
31/// 2. Load config from `seshat.toml` (or defaults)
32/// 3. Open database in XDG data directory
33/// 4. Run scan pipeline with progress reporting
34/// 5. Run convention detectors
35/// 6. Aggregate findings
36/// 7. Print report (verbosity-aware)
37pub fn run_scan(
38    path: &Path,
39    verbose: bool,
40    quiet: bool,
41    exclude_submodules: bool,
42) -> Result<(), CliError> {
43    let verbosity = Verbosity::from_flags(verbose, quiet);
44    let color = format::color_enabled();
45
46    // -- Validate path ------------------------------------------------
47    if !path.exists() {
48        return Err(CliError::InvalidPath {
49            path: path.display().to_string(),
50            reason: "path does not exist".to_owned(),
51        });
52    }
53    if !path.is_dir() {
54        return Err(CliError::InvalidPath {
55            path: path.display().to_string(),
56            reason: "path is not a directory".to_owned(),
57        });
58    }
59
60    // Resolve the project through the SHARED resolver: walks up to the git
61    // common-dir parent so all worktrees of a single repo land in one DB.
62    // For non-git directories the resolver canonicalises the input and uses
63    // its basename, matching the legacy scan-from-cwd behaviour.
64    let resolved = crate::db::resolve_project(Some(path), "scan")?;
65    let root = resolved.project_root.clone();
66    let db_path = resolved.db_path.clone();
67    let project_name = resolved.project_name.clone();
68
69    // -- Version header -----------------------------------------------
70    if verbosity.show_warnings() {
71        eprintln!("seshat v{}", env!("CARGO_PKG_VERSION"));
72    }
73
74    // -- Load config --------------------------------------------------
75    let mut config =
76        AppConfig::load().map_err(|e| CliError::scan(format!("failed to load config: {e}")))?;
77
78    // CLI flag overrides config file value.
79    if exclude_submodules {
80        config.scan.exclude_submodules = true;
81    }
82
83    // -- Open database ------------------------------------------------
84    if let Some(parent) = db_path.parent() {
85        std::fs::create_dir_all(parent)
86            .map_err(|e| CliError::scan(format!("failed to create database directory: {e}")))?;
87    }
88    let db = Database::open(&db_path)
89        .map_err(|e| CliError::scan(format!("failed to open database: {e}")))?;
90
91    // -- Detect submodules early (before root scan) --------------------
92    let submodule_paths = detect_submodule_paths(&root);
93
94    // Detect git branch for scan scoping.
95    let scan_branch = crate::db::get_current_branch(&root)
96        .map(seshat_core::BranchId::from)
97        .unwrap_or_else(|| {
98            tracing::debug!(root = %root.display(), "Could not detect git branch for scan root, defaulting to 'main'");
99            seshat_core::BranchId::from("main")
100        });
101
102    // -- Scan submodules first (each gets its own DB) -----------------
103    let start = Instant::now();
104
105    let show = verbosity.show_warnings();
106
107    // -- Submodule scan phase -----------------------------------------
108    // Track scanned submodules for updating the root DB's submodules table.
109    struct ScannedSubmodule {
110        mount_path: String,
111        name: String,
112        db_path: String,
113        commit_hash: Option<String>,
114    }
115
116    // Look up stored submodule records from the root DB for change detection.
117    let root_sub_repo_for_detect = SqliteSubmoduleRepository::new(db.connection().clone());
118
119    // Scan submodules in parallel using std::thread::scope.
120    // Each submodule gets its own thread, DB connection, and spinner line.
121    // The root scan runs after all submodule threads complete.
122    let scanned_submodules: Vec<ScannedSubmodule> = if !config.scan.exclude_submodules
123        && !submodule_paths.is_empty()
124    {
125        // Pre-filter submodules: detect, check initialization, run change detection.
126        // This is done on the main thread since it's fast (no scanning).
127        enum SubmoduleAction {
128            Skip(ScannedSubmodule),
129            Scan {
130                mount_path: String,
131                name: String,
132                submodule_abs: std::path::PathBuf,
133                commit_hash: Option<String>,
134            },
135        }
136
137        let mut actions: Vec<SubmoduleAction> = Vec::new();
138
139        for mount_path in &submodule_paths {
140            let submodule_abs = root.join(mount_path);
141            let name = mount_path
142                .rsplit('/')
143                .next()
144                .unwrap_or(mount_path)
145                .to_string();
146
147            // Emit SubmoduleDetected for each discovered submodule.
148            if show {
149                eprintln!("  \u{2139} Submodule detected: {mount_path}");
150            }
151
152            // Check if initialized (non-empty dir with .git).
153            if !submodule_abs.is_dir()
154                || (!submodule_abs.join(".git").exists() && !submodule_abs.join(".git").is_file())
155            {
156                if show {
157                    let reason = "not initialized (no .git)";
158                    eprintln!("  \u{2298} Submodule {name} skipped: {reason}");
159                }
160                continue;
161            }
162
163            // Get the current commit hash for the submodule.
164            let commit_hash = seshat_scanner::get_head_commit(&submodule_abs);
165
166            // -- Change detection: compare current hash with stored hash ------
167            let stored_record = root_sub_repo_for_detect
168                .find_by_path(mount_path)
169                .map_err(|e| {
170                    CliError::scan(format!("failed to look up submodule '{mount_path}': {e}"))
171                })?;
172
173            if let Some(ref stored) = stored_record {
174                // Both hashes must be Some and equal for an up-to-date match.
175                if let (Some(current_hash), Some(stored_hash)) = (&commit_hash, &stored.commit_hash)
176                {
177                    if current_hash == stored_hash {
178                        // Commit hash matches — check whether the IR schema
179                        // version in the existing DB is still current.
180                        // If it isn't (e.g. IR_SCHEMA_VERSION was bumped since
181                        // the last scan), we must re-scan even though the files
182                        // haven't changed, so that all rows are rewritten with
183                        // the new schema version and become visible to queries.
184                        //
185                        // Use stored.db_path (already the resolved path written
186                        // by the previous scan) to open the submodule DB.
187                        let sub_branch_for_check = crate::db::get_current_branch(&submodule_abs)
188                            .unwrap_or_else(|| {
189                                tracing::debug!(submodule = %submodule_abs.display(), "Could not detect branch for submodule, defaulting to 'main'");
190                                "main".to_owned()
191                            });
192                        let schema_ok =
193                            seshat_storage::Database::open(std::path::Path::new(&stored.db_path))
194                                .ok()
195                                .map(|sub_db| {
196                                    crate::db::submodule_ir_schema_is_current(
197                                        &sub_db,
198                                        &sub_branch_for_check,
199                                    )
200                                })
201                                .unwrap_or(false); // can't open DB → force rescan
202
203                        if schema_ok {
204                            // Submodule is fully up-to-date — skip the scan.
205                            if show {
206                                let short = if current_hash.len() >= 7 {
207                                    &current_hash[..7]
208                                } else {
209                                    current_hash
210                                };
211                                eprintln!("  \u{2713} Submodule {name} up-to-date ({short})");
212                            }
213
214                            actions.push(SubmoduleAction::Skip(ScannedSubmodule {
215                                mount_path: mount_path.clone(),
216                                name,
217                                db_path: stored.db_path.clone(),
218                                commit_hash,
219                            }));
220                            continue;
221                        }
222
223                        // Schema is stale — fall through to schedule a rescan.
224                        if show {
225                            eprintln!(
226                                "  \u{21bb} Submodule {name} IR schema outdated, re-scanning..."
227                            );
228                        }
229                    }
230                }
231            }
232
233            // Hash differs or submodule is new — schedule for parallel scan.
234            actions.push(SubmoduleAction::Scan {
235                mount_path: mount_path.clone(),
236                name,
237                submodule_abs,
238                commit_hash,
239            });
240        }
241
242        // Collect skipped submodules immediately, scan the rest in parallel.
243        let mut results: Vec<ScannedSubmodule> = Vec::new();
244        let mut to_scan: Vec<(String, String, std::path::PathBuf, Option<String>)> = Vec::new();
245
246        for action in actions {
247            match action {
248                SubmoduleAction::Skip(sub) => results.push(sub),
249                SubmoduleAction::Scan {
250                    mount_path,
251                    name,
252                    submodule_abs,
253                    commit_hash,
254                } => to_scan.push((mount_path, name, submodule_abs, commit_hash)),
255            }
256        }
257
258        if !to_scan.is_empty() {
259            // References shared across threads (read-only or thread-safe).
260            let scan_config = &config.scan;
261            let detection_config = &config.detection;
262            let project_name_ref = &project_name;
263
264            // Parallel scan via std::thread::scope — all threads join before scope exits.
265            let parallel_results: Vec<Result<ScannedSubmodule, CliError>> = std::thread::scope(
266                |scope| {
267                    let handles: Vec<_> = to_scan
268                        .iter()
269                        .map(|(mount_path, name, submodule_abs, commit_hash)| {
270                            let sp =
271                                make_manual_spinner(&format!("{name}: discovering files..."), show);
272
273                            scope.spawn(move || -> Result<ScannedSubmodule, CliError> {
274                                // Each thread opens its own DB connection.
275                                let sub_db_path = crate::db::resolve_submodule_db_path(
276                                    project_name_ref,
277                                    mount_path,
278                                )?;
279                                let sub_db = Database::open(&sub_db_path).map_err(|e| {
280                                    CliError::scan(format!(
281                                        "failed to open submodule database for '{mount_path}': {e}"
282                                    ))
283                                })?;
284
285                                // Detect branch from the submodule's git repo.
286                                let sub_branch = crate::db::get_current_branch(submodule_abs)
287                                    .map(seshat_core::BranchId::from)
288                                    .unwrap_or_else(|| {
289                                        tracing::debug!(submodule = %submodule_abs.display(), "Could not detect branch for submodule scan, defaulting to 'main'");
290                                        seshat_core::BranchId::from("main")
291                                    });
292
293                                // Run the full scan pipeline, updating the spinner
294                                // with phase info so the user sees progress.
295                                let scan_result = scan_project_with_progress(
296                                    submodule_abs,
297                                    scan_config,
298                                    &sub_db,
299                                    |event| {
300                                        match event {
301                                            ScanProgress::Discovering { count } => {
302                                                sp.set_message(format!(
303                                                    "{name}: discovering files... {count} found"
304                                                ));
305                                            }
306                                            ScanProgress::DiscoveryDone { total } => {
307                                                sp.set_message(format!(
308                                                    "{name}: discovering files... {total} found"
309                                                ));
310                                            }
311                                            ScanProgress::CollectingGitHistory => {
312                                                sp.set_message(format!(
313                                                    "{name}: collecting git history..."
314                                                ));
315                                            }
316                                            ScanProgress::Scanning { done, total } => {
317                                                sp.set_message(format!(
318                                                    "{name}: scanning files... {done}/{total}"
319                                                ));
320                                            }
321                                            ScanProgress::BuildingModuleGraph => {
322                                                sp.set_message(format!(
323                                                    "{name}: building module graph..."
324                                                ));
325                                            }
326                                            ScanProgress::AnalyzingProjectFiles => {
327                                                sp.set_message(format!(
328                                                    "{name}: analyzing manifests & docs..."
329                                                ));
330                                            }
331                                            _ => {}
332                                        }
333                                        sp.tick();
334                                    },
335                                    sub_branch.clone(),
336                                )
337                                .map_err(|e| {
338                                    CliError::scan(format!(
339                                        "submodule scan failed for '{mount_path}': {e}"
340                                    ))
341                                })?;
342
343                                sp.set_message(format!("{name}: analyzing conventions..."));
344                                sp.tick();
345
346                                let report = detect_and_persist(
347                                    &sub_db,
348                                    &sub_branch,
349                                    &detection_config.clone(),
350                                    &scan_result,
351                                )?;
352
353                                // Write repo_metadata to submodule DB.
354                                let meta =
355                                    SqliteRepoMetadataRepository::new(sub_db.connection().clone());
356                                write_metadata(
357                                    &meta,
358                                    &[
359                                        ("parent_project", project_name_ref),
360                                        ("mount_path", mount_path),
361                                        ("file_count", &report.file_count.to_string()),
362                                        ("convention_count", &report.convention_count.to_string()),
363                                        ("last_scan_time", &unix_now().to_string()),
364                                    ],
365                                )?;
366
367                                // Sentinel write moved into the scanner
368                                // orchestrator (P19) so every scan path
369                                // records last_scanned_commit automatically
370                                // — no per-caller wiring required here.
371
372                                sp.finish_with_message(format!(
373                                    "{name}: done ({} files, {} conventions)",
374                                    report.file_count, report.convention_count,
375                                ));
376
377                                Ok(ScannedSubmodule {
378                                    mount_path: mount_path.clone(),
379                                    name: name.clone(),
380                                    db_path: sub_db_path.to_string_lossy().to_string(),
381                                    commit_hash: commit_hash.clone(),
382                                })
383                            })
384                        })
385                        .collect();
386
387                    // Collect results from all threads.
388                    handles
389                        .into_iter()
390                        .map(|h| h.join().expect("submodule scan thread panicked"))
391                        .collect()
392                },
393            );
394
395            // Propagate any errors from parallel scans.
396            for result in parallel_results {
397                results.push(result?);
398            }
399        }
400
401        results
402    } else {
403        Vec::new()
404    };
405
406    // -- Run root scan with progress ----------------------------------
407    // Root scan is sequential (all submodules are done), so plain spinners
408    // are fine — no MultiProgress needed.
409    let discovery_sp = make_spinner("Discovering files...", show);
410
411    let git_sp: std::cell::RefCell<Option<ProgressBar>> = std::cell::RefCell::new(None);
412    let scan_sp: std::cell::RefCell<Option<ProgressBar>> = std::cell::RefCell::new(None);
413    let graph_sp: std::cell::RefCell<Option<ProgressBar>> = std::cell::RefCell::new(None);
414    let project_sp: std::cell::RefCell<Option<ProgressBar>> = std::cell::RefCell::new(None);
415
416    let scan_result = scan_project_with_progress(
417        &root,
418        &config.scan,
419        &db,
420        |event| match event {
421            ScanProgress::Discovering { count } => {
422                discovery_sp.set_message(format!("Discovering files... {count} found"));
423            }
424            ScanProgress::DiscoveryDone { total } => {
425                discovery_sp.finish_with_message(format!("Discovering files... {total} found"));
426            }
427            ScanProgress::CollectingGitHistory => {
428                *git_sp.borrow_mut() = Some(make_spinner("Collecting git history...", show));
429            }
430            ScanProgress::GitHistoryDone => {
431                if let Some(ref sp) = *git_sp.borrow() {
432                    sp.finish_with_message("Collecting git history... done");
433                }
434            }
435            ScanProgress::Scanning { done, total } => {
436                let mut sp_opt = scan_sp.borrow_mut();
437                if sp_opt.is_none() {
438                    *sp_opt = Some(make_spinner(&format!("Scanning files... 0/{total}"), show));
439                }
440                if let Some(ref sp) = *sp_opt {
441                    sp.set_message(format!("Scanning files... {done}/{total}"));
442                }
443            }
444            ScanProgress::ScanningDone => {
445                if let Some(ref sp) = *scan_sp.borrow() {
446                    sp.finish_with_message(sp.message().to_string());
447                }
448            }
449            ScanProgress::BuildingModuleGraph => {
450                *graph_sp.borrow_mut() = Some(make_spinner("Building module graph...", show));
451            }
452            ScanProgress::ModuleGraphDone => {
453                if let Some(ref sp) = *graph_sp.borrow() {
454                    sp.finish_with_message("Building module graph... done");
455                }
456            }
457            ScanProgress::AnalyzingProjectFiles => {
458                *project_sp.borrow_mut() =
459                    Some(make_spinner("Analyzing manifests & docs...", show));
460            }
461            ScanProgress::ProjectFilesDone => {
462                if let Some(ref sp) = *project_sp.borrow() {
463                    sp.finish_with_message("Analyzing manifests & docs... done");
464                }
465            }
466
467            // Submodule progress events are not emitted by the root orchestrator
468            // (submodules are scanned in a separate phase above), but the enum
469            // is exhaustive so we need a catch-all.
470            _ => {}
471        },
472        scan_branch.clone(),
473    )
474    .map_err(CliError::scan)?;
475
476    // -- Run convention detection + persistence on root ----------------
477    let detection_config = config.detection.clone();
478
479    let detect_sp = make_spinner("Analyzing conventions...", show);
480    let all_files = {
481        use seshat_storage::{FileIRRepository, SqliteFileIRRepository};
482        SqliteFileIRRepository::new(db.connection().clone())
483            .get_by_branch(&scan_branch)
484            .map_err(|e| CliError::scan(format!("failed to load files for detection: {e}")))?
485    };
486
487    // scan_result.source_map now contains source for ALL files (unchanged and
488    // changed alike) — the orchestrator keeps source in memory for every file
489    // it reads, not just the ones it re-parses.  So we can pass it directly
490    // to run_all_detectors and every file will go through detect_with_source,
491    // producing real snippets in convention evidence.
492    let file_count = all_files.len();
493    detect_sp.set_message(format!("Analyzing conventions... 0/{file_count}"));
494    let progress_cb = |done: usize, _total: usize| {
495        detect_sp.set_message(format!("Analyzing conventions... {done}/{file_count}"));
496    };
497    let project_context = seshat_detectors::ProjectContext::from_files(&all_files);
498    let detector_results = run_all_detectors(
499        &all_files,
500        &scan_result.source_map,
501        &detection_config,
502        &project_context,
503        Some(&progress_cb),
504    );
505    detect_sp.finish_with_message(format!(
506        "Analyzing conventions... {file_count}/{file_count}"
507    ));
508
509    let all_findings: Vec<seshat_core::ConventionFinding> = detector_results
510        .into_iter()
511        .flat_map(|dr| dr.findings)
512        .collect();
513
514    let file_dates_map: std::collections::HashMap<String, Option<i64>> = all_files
515        .iter()
516        .map(|f| {
517            let date = scan_result.file_dates.get(f.path.as_path()).copied();
518            (f.path.to_string_lossy().to_string(), date)
519        })
520        .collect();
521
522    let aggregated = aggregate_findings(
523        &all_findings,
524        &detection_config,
525        &file_dates_map,
526        unix_now(),
527    );
528
529    seshat_graph::persist_and_index(db.connection(), &scan_branch, &aggregated, &all_findings)
530        .map_err(|e| CliError::scan(format!("persist conventions: {e}")))?;
531
532    // -- Generate embeddings (optional) --------------------------------
533    // Pass changed_paths (not the full source_map) so that only new/changed
534    // files get re-embedded.  Unchanged files already have current embeddings
535    // in the DB and don't need to consume embedding API quota.
536    if let Some(ref embedding_config) = config.embedding {
537        generate_embeddings(
538            &db,
539            embedding_config,
540            &all_files,
541            &scan_result.source_map,
542            &scan_result.changed_paths,
543            &scan_branch.0,
544            show,
545        )?;
546    }
547
548    // -- Update root DB with submodule info + repo_metadata -----------
549    let root_sub_repo = SqliteSubmoduleRepository::new(db.connection().clone());
550
551    for sub in &scanned_submodules {
552        root_sub_repo
553            .upsert(&SubmoduleInput {
554                relative_path: sub.mount_path.clone(),
555                name: sub.name.clone(),
556                db_path: sub.db_path.clone(),
557                commit_hash: sub.commit_hash.clone(),
558            })
559            .map_err(|e| {
560                CliError::scan(format!(
561                    "failed to upsert submodule '{}' in root DB: {e}",
562                    sub.mount_path
563                ))
564            })?;
565    }
566
567    // Remove submodules from the root DB that are no longer in .gitmodules.
568    if let Ok(stored_submodules) = root_sub_repo.list() {
569        let active_paths: std::collections::HashSet<&str> =
570            submodule_paths.iter().map(|s| s.as_str()).collect();
571        for stored in &stored_submodules {
572            if !active_paths.contains(stored.relative_path.as_str()) {
573                let _ = root_sub_repo.delete(&stored.relative_path);
574            }
575        }
576    }
577
578    // Write repo_metadata to root DB.
579    let root_meta = SqliteRepoMetadataRepository::new(db.connection().clone());
580    write_metadata(
581        &root_meta,
582        &[
583            ("project_name", &project_name),
584            ("project_root", path.to_string_lossy().as_ref()),
585            ("file_count", &file_count.to_string()),
586            ("convention_count", &aggregated.len().to_string()),
587            ("last_scan_time", &unix_now().to_string()),
588        ],
589    )?;
590
591    // Sentinel write moved into the scanner orchestrator (P19); every
592    // scan path records last_scanned_commit automatically.
593
594    let elapsed = start.elapsed();
595
596    // -- Build report data and print ----------------------------------
597    let report_data = crate::report::build_report_data(
598        &scan_result,
599        &all_files,
600        aggregated,
601        &db_path,
602        elapsed,
603        config.scan.exclude_submodules,
604    );
605    crate::report::print_report(&report_data, verbosity, color);
606
607    Ok(())
608}
609
610/// Shared spinner style for the standard braille animation.
611fn spinner_style() -> ProgressStyle {
612    ProgressStyle::with_template("  {spinner:.cyan} {msg}")
613        .expect("valid template")
614        .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏", "✓"])
615}
616
617/// Create a spinner with automatic steady tick (80ms).
618///
619/// Use for main-thread spinners (root scan phases) where a background
620/// tick thread is safe and keeps the animation smooth.
621/// If `visible` is `false`, the spinner draws to a hidden target (silent mode).
622fn make_spinner(msg: &str, visible: bool) -> ProgressBar {
623    let sp = ProgressBar::new_spinner();
624    if visible {
625        sp.set_style(spinner_style());
626        sp.set_message(msg.to_owned());
627        sp.enable_steady_tick(std::time::Duration::from_millis(80));
628    } else {
629        sp.set_draw_target(indicatif::ProgressDrawTarget::hidden());
630    }
631    sp
632}
633
634/// Create a spinner driven manually via `tick()` + `set_message()`.
635///
636/// Use for worker-thread spinners (submodule scans) where the caller
637/// drives updates from progress callbacks. No background tick thread —
638/// avoids cursor-position races between the tick thread and the worker.
639fn make_manual_spinner(msg: &str, visible: bool) -> ProgressBar {
640    let sp = ProgressBar::new_spinner();
641    if visible {
642        sp.set_style(spinner_style());
643        sp.set_message(msg.to_owned());
644        sp.tick(); // draw initial frame
645    } else {
646        sp.set_draw_target(indicatif::ProgressDrawTarget::hidden());
647    }
648    sp
649}
650
651// ── Shared scan pipeline helpers ─────────────────────────────
652
/// Result of [`detect_and_persist`] — counts for metadata writes.
///
/// A plain value type: cheap to copy and comparable, which keeps it easy to
/// pass around and to assert on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct DetectionReport {
    /// Number of files reported by the detection cycle.
    file_count: usize,
    /// Number of conventions reported by the detection cycle.
    convention_count: usize,
}
659
660/// Run convention detection, aggregation, and persistence on an already-scanned DB.
661///
662/// Delegates to [`seshat_graph::run_detection_cycle`] — the single authoritative
663/// implementation shared with the warm-tier watcher.
664fn detect_and_persist(
665    db: &Database,
666    scan_branch: &BranchId,
667    detection_config: &DetectionConfig,
668    scan_result: &ScanResult,
669) -> Result<DetectionReport, CliError> {
670    // Build file-date map from the scan result so trend computation has git dates.
671    let file_dates_map: std::collections::HashMap<String, Option<i64>> = scan_result
672        .file_dates
673        .iter()
674        .map(|(p, &ts)| (p.to_string_lossy().to_string(), Some(ts)))
675        .collect();
676
677    let report = seshat_graph::run_detection_cycle(
678        db.connection(),
679        scan_branch,
680        detection_config,
681        &file_dates_map,
682        &scan_result.source_map,
683    )
684    .map_err(|e| CliError::scan(format!("detection pipeline failed: {e}")))?;
685
686    Ok(DetectionReport {
687        file_count: report.file_count,
688        convention_count: report.convention_count,
689    })
690}
691
692/// Write multiple key-value pairs to a [`SqliteRepoMetadataRepository`].
693fn write_metadata(
694    repo: &SqliteRepoMetadataRepository,
695    pairs: &[(&str, &str)],
696) -> Result<(), CliError> {
697    for (key, value) in pairs {
698        repo.set(key, value)
699            .map_err(|e| CliError::scan(format!("failed to write metadata '{key}': {e}")))?;
700    }
701    Ok(())
702}
703
704/// Generate embeddings for all code items (functions, types, exports) in the project.
705///
706/// When an embedding provider is configured, this function:
707/// 1. Creates the provider from config
708/// 2. Collects all (function, type, export) items from all parsed files
709/// 3. Batches texts and calls the provider
710/// 4. Stores embeddings in the `code_embeddings` table
711///
712/// On failure (e.g., provider timeout, connection error), logs a warning and
713/// continues — embedding is optional and should never break the scan pipeline.
714fn generate_embeddings(
715    db: &Database,
716    embedding_config: &seshat_embedding::EmbeddingConfig,
717    all_files: &[seshat_core::ProjectFile],
718    source_map: &std::collections::HashMap<std::path::PathBuf, String>,
719    changed_paths: &std::collections::HashSet<std::path::PathBuf>,
720    branch_id: &str,
721    show: bool,
722) -> Result<(), CliError> {
723    let provider = match seshat_embedding::create_provider(embedding_config) {
724        Ok(p) => p,
725        Err(e) => {
726            tracing::warn!("Failed to create embedding provider: {e}");
727            if show {
728                eprintln!("  \u{26a0} Embedding provider unavailable: {e}");
729            }
730            return Ok(());
731        }
732    };
733
734    // Collect items to embed: (file_path, item_name, item_kind, text_to_embed)
735    let mut items: Vec<(String, String, String, String)> = Vec::new();
736    for file in all_files {
737        // Skip files that haven't changed — their embeddings are already
738        // current in the DB from the previous scan.  Only new/changed files
739        // (tracked in changed_paths) need fresh embeddings.
740        if !changed_paths.contains(&file.path) {
741            continue;
742        }
743        // Source is always present in source_map for changed files.
744        let source = match source_map.get(&file.path) {
745            Some(s) => s,
746            None => continue,
747        };
748
749        let file_path = file.path.to_string_lossy().to_string();
750
751        // Use source already in memory — no disk read needed.
752        let source_lines: Option<Vec<String>> = Some(source.lines().map(str::to_owned).collect());
753
754        // Build import context string: module names imported in this file.
755        // Filter empty module names (e.g. side-effect imports like `import './foo'`).
756        // Cap at 20 modules to avoid consuming the model's token budget with boilerplate.
757        let import_context = {
758            let modules: Vec<&str> = file
759                .imports
760                .iter()
761                .map(|i| i.module.as_str())
762                .filter(|m| !m.is_empty())
763                .take(20)
764                .collect();
765            if modules.is_empty() {
766                String::new()
767            } else {
768                format!("\nuses: {}", modules.join(", "))
769            }
770        };
771
772        for func in &file.functions {
773            let vis = if func.is_public { "pub " } else { "" };
774            let asyncness = if func.is_async { "async " } else { "" };
775            let params = func.parameters.join(", ");
776            let body_snippet =
777                extract_body_snippet(source_lines.as_deref(), func.line, func.end_line);
778            let text = format!(
779                "{vis}{asyncness}fn {}({params}) in {file_path}{body_snippet}{import_context}",
780                func.name
781            );
782            items.push((
783                file_path.clone(),
784                func.name.clone(),
785                "function".to_string(),
786                text,
787            ));
788        }
789        for ty in &file.types {
790            let vis = if ty.is_public { "pub " } else { "" };
791            // Use explicit match instead of Debug format to get human-readable labels
792            // (e.g. "type_alias" not "TypeAlias", "class" not "Class").
793            let kind = match ty.kind {
794                seshat_core::TypeDefKind::Struct => "struct",
795                seshat_core::TypeDefKind::Enum => "enum",
796                seshat_core::TypeDefKind::Trait => "trait",
797                seshat_core::TypeDefKind::Interface => "interface",
798                seshat_core::TypeDefKind::Class => "class",
799                seshat_core::TypeDefKind::TypeAlias => "type_alias",
800            };
801            let text = format!("{vis}{kind} {} in {file_path}{import_context}", ty.name);
802            items.push((file_path.clone(), ty.name.clone(), "type".to_string(), text));
803        }
804        for exp in &file.exports {
805            let default = if exp.is_default { "default " } else { "" };
806            let text = format!(
807                "export {default}{} in {file_path}{import_context}",
808                exp.name
809            );
810            items.push((
811                file_path.clone(),
812                exp.name.clone(),
813                "export".to_string(),
814                text,
815            ));
816        }
817    }
818
819    if items.is_empty() {
820        tracing::info!("No code items to embed");
821        return Ok(());
822    }
823
824    let total = items.len();
825    let batch_size = embedding_config.batch_size.max(1);
826    let embed_sp = make_spinner(&format!("Generating embeddings... 0/{total}"), show);
827
828    let conn = db.connection().clone();
829    let embedding_repo = SqliteEmbeddingRepository::new(conn);
830
831    // Build the set of all (file_path, item_name, item_kind) that SHOULD
832    // exist in the DB after this scan succeeds. This lets us diff against
833    // stored rows and prune embeddings from deleted/renamed files.
834    let mut current_keys: std::collections::HashSet<(String, String, String)> =
835        std::collections::HashSet::new();
836    for file in all_files {
837        let file_path = file.path.to_string_lossy().to_string();
838        for func in &file.functions {
839            current_keys.insert((file_path.clone(), func.name.clone(), "function".to_string()));
840        }
841        for ty in &file.types {
842            current_keys.insert((file_path.clone(), ty.name.clone(), "type".to_string()));
843        }
844        for exp in &file.exports {
845            current_keys.insert((file_path.clone(), exp.name.clone(), "export".to_string()));
846        }
847    }
848
849    // NOTE: We intentionally do NOT delete_by_branch here. If embedding
850    // generation fails mid-way (provider timeout, rate limit), we'd lose
851    // the previously complete embedding set with nothing to replace it.
852    // Instead we rely on upsert (ON CONFLICT DO UPDATE) and prune stale
853    // rows after a successful upsert by diffing current_keys against
854    // stored_keys — stale rows from deleted/renamed files are cleaned
855    // up without risking data loss.
856
857    let mut embedded_count: usize = 0;
858
859    let _embedding_outcome: Result<(), ()> = 'embed: {
860        for chunk in items.chunks(batch_size) {
861            let texts: Vec<String> = chunk.iter().map(|(_, _, _, text)| text.clone()).collect();
862
863            match provider.embed(&texts) {
864                Ok(embeddings) => {
865                    let inputs: Vec<EmbeddingInput> = chunk
866                        .iter()
867                        .zip(embeddings)
868                        .map(
869                            |((file_path, item_name, item_kind, _), emb)| EmbeddingInput {
870                                file_path: file_path.clone(),
871                                item_name: item_name.clone(),
872                                item_kind: item_kind.clone(),
873                                embedding: emb,
874                            },
875                        )
876                        .collect();
877
878                    if let Err(e) = embedding_repo.upsert_batch(branch_id, &inputs) {
879                        tracing::warn!("Failed to store embedding batch: {e}");
880                        embed_sp.finish_with_message(
881                            "Generating embeddings... failed (storage error)".to_string(),
882                        );
883                        break 'embed Err(());
884                    }
885
886                    embedded_count += chunk.len();
887                    embed_sp
888                        .set_message(format!("Generating embeddings... {embedded_count}/{total}"));
889                }
890                Err(e) => {
891                    tracing::warn!(
892                        embedded = embedded_count,
893                        total = total,
894                        remaining = total - embedded_count,
895                        "Embedding provider error mid-batch; {embedded_count}/{total} items stored, \
896                         {} items skipped. Database contains partial embeddings: {e}",
897                        total - embedded_count,
898                    );
899                    embed_sp.finish_with_message(format!(
900                        "Generating embeddings... failed ({embedded_count}/{total})"
901                    ));
902                    if show {
903                        eprintln!(
904                            "  \u{26a0} Embedding generation failed after {embedded_count}/{total} items \
905                             ({} skipped, partial state): {e}",
906                            total - embedded_count,
907                        );
908                    }
909                    break 'embed Err(());
910                }
911            }
912        }
913
914        embed_sp.finish_with_message(format!("Generating embeddings... {embedded_count}/{total}"));
915
916        tracing::info!(
917            count = embedded_count,
918            total = total,
919            "Generated code embeddings"
920        );
921
922        Ok(())
923    };
924
925    // Prune stale embedding rows from deleted/renamed files.
926    match embedding_repo.get_stored_keys(branch_id) {
927        Ok(stored_keys) => {
928            let stored_set: std::collections::HashSet<_> = stored_keys.into_iter().collect();
929            let stale: Vec<_> = stored_set.difference(&current_keys).cloned().collect();
930
931            if !stale.is_empty() {
932                match embedding_repo.delete_stale(branch_id, &stale) {
933                    Ok(pruned) => {
934                        tracing::info!(pruned = pruned, "Pruned {} stale embedding rows", pruned);
935                    }
936                    Err(e) => {
937                        tracing::warn!(
938                            "Failed to prune stale embedding rows: {e} (will retry next scan)"
939                        );
940                    }
941                }
942            }
943        }
944        Err(e) => {
945            tracing::warn!(
946                "Failed to query stored embedding keys for stale cleanup: {e} (will retry next scan)"
947            );
948        }
949    }
950
951    Ok(())
952}
953
/// Build the body excerpt that is appended to a function's embedding text.
///
/// `start_line` and `end_line` are 1-indexed and inclusive. Bodies of at most
/// `HEAD_LINES + TAIL_LINES` lines are returned whole; longer bodies are
/// reduced to their first `HEAD_LINES` lines, a `...` marker line, and their
/// last `TAIL_LINES` lines. The excerpt is trimmed of surrounding whitespace
/// and prefixed with a single newline.
///
/// An empty string is returned when no source lines are supplied, when
/// `start_line` is zero, or when the (clamped) range selects no lines.
fn extract_body_snippet(
    source_lines: Option<&[String]>,
    start_line: usize,
    end_line: usize,
) -> String {
    const HEAD_LINES: usize = 5;
    const TAIL_LINES: usize = 3;

    let Some(all_lines) = source_lines else {
        return String::new();
    };
    if all_lines.is_empty() || start_line == 0 {
        return String::new();
    }

    // Translate the 1-indexed inclusive range into a half-open 0-indexed one,
    // clamping both ends to the lines that actually exist.
    let available = all_lines.len();
    let first = usize::min(start_line - 1, available);
    let past_last = usize::min(end_line, available);
    if first >= past_last {
        return String::new();
    }
    let body = &all_lines[first..past_last];

    // Elide the middle only when head and tail would not already cover the body.
    let snippet = if body.len() > HEAD_LINES + TAIL_LINES {
        let head = body[..HEAD_LINES].join("\n");
        let tail = body[body.len() - TAIL_LINES..].join("\n");
        [head.as_str(), "...", tail.as_str()].join("\n")
    } else {
        body.join("\n")
    };

    let trimmed = snippet.trim();
    let mut excerpt = String::with_capacity(trimmed.len() + 1);
    excerpt.push('\n');
    excerpt.push_str(trimmed);
    excerpt
}
1008
1009#[cfg(test)]
1010mod tests {
1011    use super::*;
1012    use seshat_scanner::scan_project;
1013    use seshat_storage::{
1014        Database, FileIRRepository, RepoMetadataRepository, SqliteFileIRRepository,
1015        SqliteRepoMetadataRepository, SqliteSubmoduleRepository, SubmoduleInput,
1016        SubmoduleRepository,
1017    };
1018    use std::fs;
1019    use tempfile::tempdir;
1020
1021    /// Helper: create a root project with a mock submodule directory.
1022    ///
1023    /// Layout:
1024    /// ```text
1025    /// root/
1026    ///   .git/
1027    ///   .gitmodules          (declares "frontend" submodule)
1028    ///   src/main.rs
1029    ///   frontend/
1030    ///     .git/              (marks it as an initialized submodule)
1031    ///     src/app.ts
1032    /// ```
1033    fn create_project_with_submodule() -> tempfile::TempDir {
1034        let dir = tempdir().expect("create tempdir");
1035        let root = dir.path();
1036
1037        // Root project
1038        fs::create_dir_all(root.join(".git")).unwrap();
1039        fs::create_dir_all(root.join("src")).unwrap();
1040        fs::write(
1041            root.join("src/main.rs"),
1042            "pub fn main() { println!(\"hello\"); }\n",
1043        )
1044        .unwrap();
1045
1046        // .gitmodules declaring the submodule
1047        fs::write(
1048            root.join(".gitmodules"),
1049            "[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/fe.git\n",
1050        )
1051        .unwrap();
1052
1053        // Submodule directory (initialized with .git)
1054        fs::create_dir_all(root.join("frontend/.git")).unwrap();
1055        fs::create_dir_all(root.join("frontend/src")).unwrap();
1056        fs::write(
1057            root.join("frontend/src/app.ts"),
1058            "export function app(): string { return 'hello'; }\n",
1059        )
1060        .unwrap();
1061
1062        dir
1063    }
1064
1065    #[test]
1066    fn submodule_scan_creates_separate_dbs_with_correct_structure() {
1067        let dir = create_project_with_submodule();
1068        let root = dir.path();
1069        let config = seshat_core::ScanConfig::default();
1070
1071        // Create root DB and submodule DB (both in-memory for testing).
1072        let root_db = Database::open(":memory:").expect("open root DB");
1073        let sub_db = Database::open(":memory:").expect("open submodule DB");
1074
1075        // Scan root project (submodule dirs are excluded from root discovery).
1076        let root_result = scan_project(root, &config, &root_db, BranchId::from("main"))
1077            .expect("root scan should succeed");
1078        assert!(
1079            !root_result.excluded_submodules.is_empty(),
1080            "should detect submodule in .gitmodules"
1081        );
1082        assert_eq!(root_result.excluded_submodules, vec!["frontend"]);
1083
1084        // Root should only find main.rs (frontend is excluded).
1085        assert_eq!(
1086            root_result.files_discovered, 1,
1087            "root should discover 1 file (main.rs)"
1088        );
1089
1090        // Scan submodule directory into its own DB.
1091        let sub_root = root.join("frontend");
1092        let sub_result = scan_project(&sub_root, &config, &sub_db, BranchId::from("main"))
1093            .expect("submodule scan should succeed");
1094        assert_eq!(
1095            sub_result.files_discovered, 1,
1096            "submodule should discover 1 file (app.ts)"
1097        );
1098
1099        // Verify both DBs have IR records.
1100        use seshat_storage::{FileIRRepository, SqliteFileIRRepository};
1101        let branch = BranchId::from("main");
1102
1103        let root_files = SqliteFileIRRepository::new(root_db.connection().clone())
1104            .get_by_branch(&branch)
1105            .unwrap();
1106        assert_eq!(root_files.len(), 1, "root DB should have 1 file IR");
1107
1108        let sub_files = SqliteFileIRRepository::new(sub_db.connection().clone())
1109            .get_by_branch(&branch)
1110            .unwrap();
1111        assert_eq!(sub_files.len(), 1, "submodule DB should have 1 file IR");
1112
1113        // Write repo_metadata to submodule DB (as run_scan does).
1114        let sub_meta = SqliteRepoMetadataRepository::new(sub_db.connection().clone());
1115        sub_meta.set("parent_project", "my-project").unwrap();
1116        sub_meta.set("mount_path", "frontend").unwrap();
1117        sub_meta
1118            .set("file_count", &sub_result.files_discovered.to_string())
1119            .unwrap();
1120        sub_meta.set("convention_count", "0").unwrap();
1121        sub_meta.set("last_scan_time", "1700000000").unwrap();
1122
1123        assert_eq!(
1124            sub_meta.get("parent_project").unwrap().unwrap(),
1125            "my-project"
1126        );
1127        assert_eq!(sub_meta.get("mount_path").unwrap().unwrap(), "frontend");
1128        assert_eq!(sub_meta.get("file_count").unwrap().unwrap(), "1");
1129
1130        // Write submodule record to root DB (as run_scan does).
1131        let root_sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1132        root_sub_repo
1133            .insert(&SubmoduleInput {
1134                relative_path: "frontend".to_string(),
1135                name: "frontend".to_string(),
1136                db_path: "/data/seshat/repos/my-project/frontend.db".to_string(),
1137                commit_hash: None, // mock submodule has no real commits
1138            })
1139            .unwrap();
1140
1141        let stored = root_sub_repo.list().unwrap();
1142        assert_eq!(stored.len(), 1);
1143        assert_eq!(stored[0].relative_path, "frontend");
1144        assert_eq!(stored[0].name, "frontend");
1145
1146        // Write repo_metadata to root DB.
1147        let root_meta = SqliteRepoMetadataRepository::new(root_db.connection().clone());
1148        root_meta.set("project_name", "my-project").unwrap();
1149        root_meta
1150            .set("file_count", &root_result.files_discovered.to_string())
1151            .unwrap();
1152        root_meta.set("convention_count", "0").unwrap();
1153        root_meta.set("last_scan_time", "1700000000").unwrap();
1154
1155        assert_eq!(
1156            root_meta.get("project_name").unwrap().unwrap(),
1157            "my-project"
1158        );
1159        assert_eq!(root_meta.get("file_count").unwrap().unwrap(), "1");
1160    }
1161
1162    #[test]
1163    fn uninitialised_submodule_is_skipped() {
1164        let dir = tempdir().expect("create tempdir");
1165        let root = dir.path();
1166
1167        fs::create_dir_all(root.join(".git")).unwrap();
1168        fs::create_dir_all(root.join("src")).unwrap();
1169        fs::write(root.join("src/main.rs"), "pub fn main() {}\n").unwrap();
1170
1171        // .gitmodules declares a submodule that exists as a directory but has no .git
1172        fs::write(
1173            root.join(".gitmodules"),
1174            "[submodule \"libs/shared\"]\n\tpath = libs/shared\n\turl = https://example.com\n",
1175        )
1176        .unwrap();
1177        fs::create_dir_all(root.join("libs/shared")).unwrap();
1178        // No .git in libs/shared — it's not initialized
1179
1180        let config = seshat_core::ScanConfig::default();
1181        let db = Database::open(":memory:").expect("open DB");
1182
1183        let result =
1184            scan_project(root, &config, &db, BranchId::from("main")).expect("scan should succeed");
1185
1186        // Submodule dirs are always excluded from root discovery.
1187        assert_eq!(result.excluded_submodules, vec!["libs/shared"]);
1188        // Root only finds main.rs.
1189        assert_eq!(result.files_discovered, 1);
1190    }
1191
1192    #[test]
1193    fn submodule_removed_from_gitmodules_gets_deleted_from_table() {
1194        let root_db = Database::open(":memory:").expect("open DB");
1195        let sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1196
1197        // Simulate a previously scanned submodule in the table.
1198        sub_repo
1199            .insert(&SubmoduleInput {
1200                relative_path: "old-module".to_string(),
1201                name: "old-module".to_string(),
1202                db_path: "/data/repos/project/old-module.db".to_string(),
1203                commit_hash: Some("abc123".to_string()),
1204            })
1205            .unwrap();
1206
1207        // Current .gitmodules no longer includes "old-module".
1208        let active_paths: std::collections::HashSet<&str> = ["frontend"].iter().copied().collect();
1209
1210        let stored = sub_repo.list().unwrap();
1211        for stored_sub in &stored {
1212            if !active_paths.contains(stored_sub.relative_path.as_str()) {
1213                let _ = sub_repo.delete(&stored_sub.relative_path);
1214            }
1215        }
1216
1217        let remaining = sub_repo.list().unwrap();
1218        assert!(
1219            remaining.is_empty(),
1220            "old-module should have been removed from submodules table"
1221        );
1222    }
1223
1224    // -- US-005: Change detection unit tests --------------------------
1225
1226    /// Helper: determine if a submodule should be skipped based on stored vs current hash.
1227    /// Returns true if the scan should be skipped (hashes match).
1228    fn should_skip_submodule(stored_hash: Option<&str>, current_hash: Option<&str>) -> bool {
1229        match (current_hash, stored_hash) {
1230            (Some(current), Some(stored)) => current == stored,
1231            _ => false,
1232        }
1233    }
1234
1235    #[test]
1236    fn change_detection_skip_when_hashes_match() {
1237        // Both hashes are Some and equal → skip.
1238        assert!(should_skip_submodule(
1239            Some("abc123def456abc123def456abc123def456abc123"),
1240            Some("abc123def456abc123def456abc123def456abc123"),
1241        ));
1242    }
1243
1244    #[test]
1245    fn change_detection_rescan_when_hashes_differ() {
1246        // Both hashes are Some but different → rescan.
1247        assert!(!should_skip_submodule(
1248            Some("abc123def456abc123def456abc123def456abc123"),
1249            Some("000000def456abc123def456abc123def456abc123"),
1250        ));
1251    }
1252
1253    #[test]
1254    fn change_detection_rescan_when_no_stored_hash() {
1255        // Stored hash is None (first scan or no commits at previous scan) → rescan.
1256        assert!(!should_skip_submodule(
1257            None,
1258            Some("abc123def456abc123def456abc123def456abc123"),
1259        ));
1260    }
1261
1262    #[test]
1263    fn change_detection_rescan_when_no_current_hash() {
1264        // Current hash is None (submodule has no commits now) → rescan.
1265        assert!(!should_skip_submodule(
1266            Some("abc123def456abc123def456abc123def456abc123"),
1267            None,
1268        ));
1269    }
1270
1271    #[test]
1272    fn change_detection_rescan_when_both_hashes_none() {
1273        // Both hashes are None → rescan (can't confirm up-to-date).
1274        assert!(!should_skip_submodule(None, None));
1275    }
1276
1277    #[test]
1278    fn change_detection_new_submodule_triggers_full_scan() {
1279        // New submodule: not in the stored table at all → no stored record.
1280        let root_db = Database::open(":memory:").expect("open DB");
1281        let sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1282
1283        // Submodule "frontend" not in the table yet.
1284        let stored = sub_repo.find_by_path("frontend").unwrap();
1285        assert!(stored.is_none(), "new submodule should not be in table");
1286
1287        // Since there's no stored record, the change detection logic
1288        // will fall through to full scan (no match possible).
1289    }
1290
1291    #[test]
1292    fn change_detection_updated_hash_stored_after_rescan() {
1293        let root_db = Database::open(":memory:").expect("open DB");
1294        let sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1295
1296        // Insert a submodule with an old hash.
1297        let old_hash = "aaaa".repeat(10);
1298        sub_repo
1299            .insert(&SubmoduleInput {
1300                relative_path: "frontend".to_string(),
1301                name: "frontend".to_string(),
1302                db_path: "/data/repos/project/frontend.db".to_string(),
1303                commit_hash: Some(old_hash.clone()),
1304            })
1305            .unwrap();
1306
1307        // Simulate: current hash differs → rescan happened → update stored hash.
1308        let new_hash = "bbbb".repeat(10);
1309        sub_repo
1310            .update(&SubmoduleInput {
1311                relative_path: "frontend".to_string(),
1312                name: "frontend".to_string(),
1313                db_path: "/data/repos/project/frontend.db".to_string(),
1314                commit_hash: Some(new_hash.clone()),
1315            })
1316            .unwrap();
1317
1318        let stored = sub_repo.find_by_path("frontend").unwrap().unwrap();
1319        assert_eq!(
1320            stored.commit_hash.as_deref(),
1321            Some(new_hash.as_str()),
1322            "stored hash should be updated after rescan"
1323        );
1324
1325        // On the next scan, the hashes will match → skip.
1326        assert!(should_skip_submodule(
1327            stored.commit_hash.as_deref(),
1328            Some(&new_hash),
1329        ));
1330    }
1331
1332    #[test]
1333    fn change_detection_skipped_submodule_not_deleted_from_table() {
1334        let root_db = Database::open(":memory:").expect("open DB");
1335        let sub_repo = SqliteSubmoduleRepository::new(root_db.connection().clone());
1336
1337        let hash = "abcd".repeat(10);
1338        sub_repo
1339            .insert(&SubmoduleInput {
1340                relative_path: "frontend".to_string(),
1341                name: "frontend".to_string(),
1342                db_path: "/data/repos/project/frontend.db".to_string(),
1343                commit_hash: Some(hash.clone()),
1344            })
1345            .unwrap();
1346
1347        // Simulate: submodule was skipped (up-to-date) but still tracked in
1348        // the scanned_submodules list, so cleanup won't delete it.
1349        let active_paths: std::collections::HashSet<&str> = ["frontend"].iter().copied().collect();
1350
1351        let stored = sub_repo.list().unwrap();
1352        for stored_sub in &stored {
1353            if !active_paths.contains(stored_sub.relative_path.as_str()) {
1354                let _ = sub_repo.delete(&stored_sub.relative_path);
1355            }
1356        }
1357
1358        let remaining = sub_repo.list().unwrap();
1359        assert_eq!(
1360            remaining.len(),
1361            1,
1362            "skipped submodule should remain in table"
1363        );
1364        assert_eq!(remaining[0].relative_path, "frontend");
1365    }
1366
1367    // ── extract_body_snippet tests ────────────────────────────────────────────
1368
1369    fn make_lines(n: usize) -> Vec<String> {
1370        (1..=n).map(|i| format!("line_{i}")).collect()
1371    }
1372
1373    #[test]
1374    fn body_snippet_none_source_returns_empty() {
1375        assert_eq!(extract_body_snippet(None, 1, 5), "");
1376    }
1377
1378    #[test]
1379    fn body_snippet_start_zero_returns_empty() {
1380        let lines = make_lines(10);
1381        // start_line=0 is invalid (IR lines are 1-indexed)
1382        assert_eq!(extract_body_snippet(Some(&lines), 0, 5), "");
1383    }
1384
1385    #[test]
1386    fn body_snippet_single_line_function() {
1387        let lines = make_lines(20);
1388        // Function at line 5, single line
1389        let result = extract_body_snippet(Some(&lines), 5, 5);
1390        assert!(!result.is_empty());
1391        assert!(result.contains("line_5"));
1392    }
1393
1394    #[test]
1395    fn body_snippet_short_function_returns_all_lines() {
1396        let lines = make_lines(20);
1397        // Function lines 3-7 (5 lines) — fits in HEAD (5) without truncation
1398        let result = extract_body_snippet(Some(&lines), 3, 7);
1399        assert!(result.contains("line_3"));
1400        assert!(result.contains("line_7"));
1401        assert!(!result.contains("...")); // no truncation marker
1402    }
1403
1404    #[test]
1405    fn body_snippet_long_function_has_head_and_tail() {
1406        let lines = make_lines(50);
1407        // Function lines 1-50 — should produce head...tail
1408        let result = extract_body_snippet(Some(&lines), 1, 50);
1409        assert!(result.contains("line_1")); // head
1410        assert!(result.contains("line_5")); // head last
1411        assert!(result.contains("...")); // truncation marker
1412        assert!(result.contains("line_50")); // tail last
1413        assert!(result.contains("line_48")); // tail first
1414        // middle lines should NOT appear
1415        assert!(!result.contains("line_25"));
1416    }
1417
1418    #[test]
1419    fn body_snippet_exactly_boundary_no_overlap() {
1420        let lines = make_lines(20);
1421        // HEAD_LINES=5 + TAIL_LINES=3 = 8. Function with exactly 8 lines
1422        // should NOT produce ... (fits entirely)
1423        let result = extract_body_snippet(Some(&lines), 1, 8);
1424        assert!(
1425            !result.contains("..."),
1426            "8-line function should not be truncated"
1427        );
1428        assert!(result.contains("line_1"));
1429        assert!(result.contains("line_8")); // all 8 lines present
1430    }
1431
1432    #[test]
1433    fn body_snippet_trim_applied() {
1434        let lines = vec![
1435            "  fn foo() {".to_owned(),
1436            "    let x = 1;".to_owned(),
1437            "  }".to_owned(),
1438        ];
1439        let result = extract_body_snippet(Some(&lines), 1, 3);
1440        // Should start with \n then trimmed content
1441        assert!(result.starts_with('\n'));
1442        assert!(!result.starts_with("\n  ")); // leading whitespace trimmed
1443    }
1444
1445    #[test]
1446    fn body_snippet_empty_lines_returns_empty() {
1447        let lines: Vec<String> = Vec::new();
1448        assert_eq!(extract_body_snippet(Some(&lines), 1, 5), "");
1449    }
1450
1451    #[test]
1452    fn body_snippet_start_after_end_returns_empty() {
1453        // start_line > end_line is invalid — early return.
1454        let lines = make_lines(20);
1455        assert_eq!(extract_body_snippet(Some(&lines), 10, 5), "");
1456    }
1457
1458    #[test]
1459    fn body_snippet_end_line_clamped_to_available() {
1460        // end_line beyond available lines must clamp, not panic.
1461        let lines = make_lines(5);
1462        let result = extract_body_snippet(Some(&lines), 1, 999);
1463        assert!(result.contains("line_1"));
1464        assert!(result.contains("line_5"));
1465    }
1466
1467    #[test]
1468    fn body_snippet_start_at_last_line_returns_single_line() {
1469        let lines = make_lines(5);
1470        // start_line=5 → start=4, end=5.min(5)=5 → body = lines[4..5]
1471        let result = extract_body_snippet(Some(&lines), 5, 5);
1472        assert!(result.contains("line_5"));
1473        assert!(!result.contains("line_4"));
1474    }
1475
1476    #[test]
1477    fn body_snippet_start_past_lines_returns_empty() {
1478        // start_line - 1 == lines.len() (clamp), so start == end → empty.
1479        let lines = make_lines(3);
1480        assert_eq!(extract_body_snippet(Some(&lines), 4, 4), "");
1481    }
1482
1483    #[test]
1484    fn body_snippet_long_body_skips_middle_lines() {
1485        // Body of 15 lines: HEAD=5, TAIL=3 → 7 middle lines must be omitted.
1486        let lines = make_lines(20);
1487        let result = extract_body_snippet(Some(&lines), 1, 15);
1488        assert!(result.contains("line_1"));
1489        assert!(result.contains("line_5")); // HEAD ends
1490        assert!(!result.contains("line_6")); // first omitted
1491        assert!(!result.contains("line_10")); // middle omitted
1492        assert!(result.contains("line_13")); // TAIL begins
1493        assert!(result.contains("line_15")); // TAIL ends
1494        assert!(result.contains("..."));
1495    }
1496
1497    // ── Branch-aware detect_and_persist tests ──────────────────────────────────
1498
1499    #[test]
1500    fn detect_and_persist_uses_branch_id_for_loading_files() {
1501        let db = Database::open(":memory:").expect("open DB");
1502        let feature_branch = BranchId::from("feat/my-feature");
1503
1504        use seshat_core::test_helpers::make_project_file;
1505        use seshat_storage::{FileIRRepository, SqliteFileIRRepository};
1506
1507        let file = make_project_file(seshat_core::Language::Rust);
1508        SqliteFileIRRepository::new(db.connection().clone())
1509            .upsert(&feature_branch, &file, None)
1510            .expect("upsert file under feature branch");
1511
1512        let scan_result = seshat_scanner::ScanResult {
1513            files_discovered: 1,
1514            files_parsed: 1,
1515            nodes_persisted: 0,
1516            edges_persisted: 0,
1517            manifests_analyzed: 0,
1518            docs_ingested: 0,
1519            manifest_analyses: vec![],
1520            incremental: None,
1521            file_dates: std::collections::HashMap::new(),
1522            excluded_submodules: vec![],
1523            source_map: std::collections::HashMap::new(),
1524            changed_paths: std::collections::HashSet::new(),
1525        };
1526
1527        let config = DetectionConfig::default();
1528        let result = detect_and_persist(&db, &feature_branch, &config, &scan_result);
1529        assert!(
1530            result.is_ok(),
1531            "detect_and_persist should succeed: {result:?}"
1532        );
1533        let report = result.unwrap();
1534        assert_eq!(
1535            report.file_count, 1,
1536            "should find the file stored under feature branch"
1537        );
1538    }
1539
1540    #[test]
1541    fn detect_and_persist_returns_zero_for_wrong_branch() {
1542        let db = Database::open(":memory:").expect("open DB");
1543        let feature_branch = BranchId::from("feat/my-feature");
1544        let main_branch = BranchId::from("main");
1545
1546        use seshat_core::test_helpers::make_project_file;
1547        use seshat_storage::{FileIRRepository, SqliteFileIRRepository};
1548
1549        let file = make_project_file(seshat_core::Language::Rust);
1550        SqliteFileIRRepository::new(db.connection().clone())
1551            .upsert(&feature_branch, &file, None)
1552            .expect("upsert file under feature branch");
1553
1554        let scan_result = seshat_scanner::ScanResult {
1555            files_discovered: 1,
1556            files_parsed: 1,
1557            nodes_persisted: 0,
1558            edges_persisted: 0,
1559            manifests_analyzed: 0,
1560            docs_ingested: 0,
1561            manifest_analyses: vec![],
1562            incremental: None,
1563            file_dates: std::collections::HashMap::new(),
1564            excluded_submodules: vec![],
1565            source_map: std::collections::HashMap::new(),
1566            changed_paths: std::collections::HashSet::new(),
1567        };
1568
1569        let config = DetectionConfig::default();
1570        let result = detect_and_persist(&db, &main_branch, &config, &scan_result);
1571        assert!(result.is_ok());
1572        let report = result.unwrap();
1573        assert_eq!(report.file_count, 0, "main branch should have no files");
1574    }
1575
1576    #[test]
1577    fn detect_and_persist_persists_conventions_under_correct_branch() {
1578        let db = Database::open(":memory:").expect("open DB");
1579        let feature_branch = BranchId::from("feat/snippets");
1580
1581        use seshat_core::test_helpers::make_project_file;
1582        use seshat_storage::{
1583            FileIRRepository, NodeRepository, SqliteFileIRRepository, SqliteNodeRepository,
1584        };
1585
1586        let file = make_project_file(seshat_core::Language::Rust);
1587        SqliteFileIRRepository::new(db.connection().clone())
1588            .upsert(&feature_branch, &file, None)
1589            .expect("upsert file under feature branch");
1590
1591        let scan_result = seshat_scanner::ScanResult {
1592            files_discovered: 1,
1593            files_parsed: 1,
1594            nodes_persisted: 0,
1595            edges_persisted: 0,
1596            manifests_analyzed: 0,
1597            docs_ingested: 0,
1598            manifest_analyses: vec![],
1599            incremental: None,
1600            file_dates: std::collections::HashMap::new(),
1601            excluded_submodules: vec![],
1602            source_map: std::collections::HashMap::new(),
1603            changed_paths: std::collections::HashSet::new(),
1604        };
1605
1606        let config = DetectionConfig::default();
1607        let result = detect_and_persist(&db, &feature_branch, &config, &scan_result);
1608        assert!(result.is_ok());
1609
1610        let node_repo = SqliteNodeRepository::new(db.connection().clone());
1611        let nodes = node_repo
1612            .find_by_branch(&feature_branch)
1613            .expect("find nodes");
1614        assert!(
1615            !nodes.is_empty(),
1616            "conventions should be persisted under feature branch"
1617        );
1618
1619        let main_nodes = node_repo
1620            .find_by_branch(&BranchId::from("main"))
1621            .expect("find nodes");
1622        assert!(
1623            main_nodes.is_empty(),
1624            "no conventions should be under main branch"
1625        );
1626    }
1627
1628    #[test]
1629    fn scan_project_with_source_map_produces_snippets() {
1630        let dir = tempdir().expect("create tempdir");
1631        let root = dir.path();
1632
1633        fs::create_dir_all(root.join(".git")).unwrap();
1634        fs::create_dir_all(root.join("src")).unwrap();
1635        fs::write(
1636            root.join("src/main.rs"),
1637            "use std::error::Error;\n\npub fn main() {}\n",
1638        )
1639        .unwrap();
1640
1641        let config = seshat_core::ScanConfig::default();
1642        let db = Database::open(":memory:").expect("open DB");
1643        let branch = BranchId::from("test-branch");
1644
1645        let result = scan_project(root, &config, &db, branch.clone()).expect("scan should succeed");
1646        assert!(
1647            !result.source_map.is_empty(),
1648            "source_map should contain files"
1649        );
1650
1651        let file_ir_repo = SqliteFileIRRepository::new(db.connection().clone());
1652        let files = file_ir_repo.get_by_branch(&branch).expect("get files");
1653        assert!(
1654            !files.is_empty(),
1655            "files should be stored under the scan branch"
1656        );
1657
1658        let main_files = file_ir_repo
1659            .get_by_branch(&BranchId::from("main"))
1660            .expect("get files");
1661        assert!(
1662            main_files.is_empty() || main_files.len() != files.len(),
1663            "files should NOT be stored under main branch when scanning a different branch"
1664        );
1665    }
1666}