Skip to main content

sqry_core/graph/unified/build/
entrypoint.rs

1//! Build entrypoint for unified graph.
2//!
3//! This module provides the top-level API for building a unified graph from source files.
4//! It orchestrates file discovery and delegates to the 5-pass build pipeline.
5
6use std::fs;
7use std::path::{Path, PathBuf};
8use std::time::{Duration, Instant};
9
10use anyhow::{Context, Result};
11use ignore::WalkBuilder;
12use rayon::prelude::*;
13
14use crate::graph::GraphBuilderError;
15use crate::graph::error::GraphResult;
16use crate::graph::unified::analysis::LabelBudgetConfig;
17use crate::graph::unified::analysis::ReachabilityStrategy;
18use crate::graph::unified::build::StagingGraph;
19use crate::graph::unified::build::cancellation::CancellationToken;
20use crate::graph::unified::build::parallel_commit::{
21    GlobalOffsets, phase2_assign_ranges, phase3_parallel_commit, phase4_apply_global_remap,
22    phase4c_prime_unify_cross_file_nodes, phase4d_bulk_insert_edges,
23};
24use crate::graph::unified::build::pass3_intra::PendingEdge;
25use crate::graph::unified::build::progress::GraphBuildProgressTracker;
26use crate::graph::unified::concurrent::CodeGraph;
27use crate::io::FileReader;
28use crate::plugin::PluginManager;
29use crate::plugin::error::ParseError;
30use crate::plugin::{SafeParser, SafeParserConfig};
31use crate::progress::{SharedReporter, no_op_reporter};
32use crate::project::path_utils::normalize_path_components;
33
34/// Result of a successful build-and-persist operation.
35///
36/// Contains all metadata about the completed graph build, including
37/// canonical (deduplicated) edge counts, file counts by language, and
38/// provenance information.
39#[derive(Debug, Clone)]
40pub struct BuildResult {
41    /// Number of nodes in the graph.
42    pub node_count: usize,
43    /// Number of deduplicated edges (from analysis CSR, after merge/compaction).
44    /// This is the canonical edge count.
45    pub edge_count: usize,
46    /// Number of raw edges in the graph (CSR + delta buffer, before dedup).
47    /// Available for diagnostics; NOT the canonical count.
48    pub raw_edge_count: usize,
49    /// Number of indexed files, by language (e.g., `{"rust": 150, "python": 30}`).
50    ///
51    /// Counts files that entered the graph indexing pipeline and were
52    /// successfully parsed by a plugin. Not the same as "scanned files"
53    /// (all files walked by the directory scanner).
54    pub file_count: std::collections::HashMap<String, usize>,
55    /// Total number of indexed files.
56    pub total_files: usize,
57    /// ISO 8601 timestamp when the build completed.
58    pub built_at: String,
59    /// Root path that was indexed.
60    pub root_path: String,
61    /// Number of threads used for parallel file processing.
62    ///
63    /// Reflects the effective thread count from the rayon pool, not the
64    /// CLI-requested value. Useful for build diagnostics.
65    pub thread_count: usize,
66
67    /// Deterministic ordered built-in plugin ids active during the build.
68    pub active_plugin_ids: Vec<String>,
69
70    /// Reachability strategy used by each persisted analysis kind.
71    pub analysis_strategies: Vec<AnalysisStrategySummary>,
72}
73
74/// Persisted analysis strategy summary for one edge kind.
75#[derive(Debug, Clone, PartialEq, Eq)]
76pub struct AnalysisStrategySummary {
77    /// Stable edge-kind label (`calls`, `imports`, `references`, `inherits`).
78    pub edge_kind: &'static str,
79    /// Reachability strategy persisted for the edge kind.
80    pub strategy: ReachabilityStrategy,
81}
82
/// Per-batch staging memory ceiling used when nothing else is configured: 512 MB.
///
/// Once the accumulated `StagingGraph` memory goes past this threshold, the
/// current batch is committed before the next chunk is parsed. The value can
/// be overridden through `SQRY_STAGING_MEMORY_LIMIT_MB` or
/// [`BuildConfig::staging_memory_limit`].
const DEFAULT_STAGING_MEMORY_LIMIT: usize = 512 << 20; // 512 * 1024 * 1024 bytes
89
/// Directory names that first-party source discovery skips by default.
///
/// Each entry is a dependency root, build output directory, editor cache, or
/// CI runner cache: places that routinely hold generated code or vendored
/// third-party dependencies. `.gitignore` and related ignore files are still
/// honored by the indexer; this list is a safety net for editor-triggered
/// indexing when those files are missing or incomplete.
///
/// Repositories that intentionally keep first-party code in one of these
/// directories can turn the built-in excludes off by setting
/// `SQRY_INCLUDE_DEFAULT_EXCLUDED_DIRS=1`.
const DEFAULT_EXCLUDED_SOURCE_DIRS: &[&str] = &[
    // Version-control metadata.
    ".git",
    ".hg",
    ".svn",
    // Hidden tool caches, framework outputs, and virtual environments.
    ".cache",
    ".next",
    ".nuxt",
    ".sqry",
    ".turbo",
    ".venv",
    "__pycache__",
    // Underscore-prefixed roots (presumably the CI runner caches mentioned above).
    "_actions",
    "_update",
    "_work",
    // Build outputs and dependency trees.
    "build",
    "dist",
    "node_modules",
    "target",
    "vendor",
    "venv",
];
120
/// Directory-name prefixes excluded by default.
///
/// NOTE(review): presumably the prefix-matching companion to the exact-name
/// default exclude list, applied by the file-discovery code; the consumer is
/// not visible in this part of the file, so confirm before relying on it.
const DEFAULT_EXCLUDED_SOURCE_DIR_PREFIXES: &[&str] = &["externals."];
122
123/// Configuration for building the unified graph.
124#[derive(Debug, Clone)]
125pub struct BuildConfig {
126    /// Maximum directory depth to traverse (None = unlimited).
127    pub max_depth: Option<usize>,
128
129    /// Follow symbolic links.
130    pub follow_links: bool,
131
132    /// Include hidden files and directories.
133    pub include_hidden: bool,
134
135    /// Number of threads for parallel building (None = use default based on CPU count).
136    pub num_threads: Option<usize>,
137
138    /// Maximum staging memory (bytes) to accumulate before committing a batch.
139    ///
140    /// Controls the parse-commit chunking watermark. When the sum of all
141    /// in-flight `StagingGraph` buffers exceeds this limit, the batch is
142    /// committed to the graph before the next chunk of files is parsed.
143    ///
144    /// Defaults to 512 MB. Override via
145    /// `SQRY_STAGING_MEMORY_LIMIT_MB` environment variable.
146    pub staging_memory_limit: usize,
147
148    /// Configuration for the 2-hop label budget used during analysis.
149    ///
150    /// Controls the maximum number of intervals per edge kind and what
151    /// to do when the budget is exceeded (fail or degrade to BFS).
152    pub label_budget: LabelBudgetConfig,
153}
154
155impl Default for BuildConfig {
156    fn default() -> Self {
157        let limit = std::env::var("SQRY_STAGING_MEMORY_LIMIT_MB")
158            .ok()
159            .and_then(|v| v.parse::<usize>().ok())
160            .map_or(DEFAULT_STAGING_MEMORY_LIMIT, |mb| mb * 1024 * 1024);
161
162        let label_budget = LabelBudgetConfig {
163            budget_per_kind: 15_000_000,
164            on_exceeded: crate::graph::unified::analysis::BudgetExceededPolicy::Degrade,
165            density_gate_threshold: 64,
166            skip_labels: false,
167        };
168
169        Self {
170            max_depth: None,
171            follow_links: false,
172            include_hidden: false,
173            num_threads: None,
174            staging_memory_limit: limit,
175            label_budget,
176        }
177    }
178}
179
180/// Create a rayon thread pool sized by `BuildConfig::num_threads`.
181fn create_thread_pool(config: &BuildConfig) -> Result<rayon::ThreadPool> {
182    let mut builder = rayon::ThreadPoolBuilder::new();
183    if let Some(n) = config.num_threads {
184        builder = builder.num_threads(n);
185    }
186    builder
187        .build()
188        .context("Failed to create rayon thread pool for parallel indexing")
189}
190
191/// Compute chunk boundaries for memory-bounded parallel parse batches.
192///
193/// Splits `files` into non-overlapping ranges where each chunk's estimated
194/// staging memory stays within `memory_limit`. Uses source file size as a
195/// proxy for staging buffer size (multiplied by an expansion factor to
196/// account for AST node/edge/string overhead).
197///
198/// Returns at least one chunk even if the first file alone exceeds the limit.
199fn compute_parse_chunks(
200    files: &[PathBuf],
201    _pool: &rayon::ThreadPool,
202    _plugins: &PluginManager,
203    memory_limit: usize,
204) -> Vec<std::ops::Range<usize>> {
205    // Expansion factor: staging buffers are typically 2-8x the source file
206    // size due to AST nodes, edges, and interned strings. Use 4x as a
207    // conservative middle ground.
208    const EXPANSION_FACTOR: usize = 4;
209
210    let mut chunks = Vec::new();
211    let mut chunk_start = 0;
212    let mut chunk_estimate = 0usize;
213
214    for (i, path) in files.iter().enumerate() {
215        #[allow(clippy::cast_possible_truncation)] // File sizes always fit usize on 32/64-bit.
216        let file_size = std::fs::metadata(path)
217            .map(|m| m.len() as usize)
218            .unwrap_or(0);
219        let estimated_staging = file_size * EXPANSION_FACTOR;
220
221        // If adding this file would exceed the limit and we already have
222        // files in the chunk, finalize the current chunk first.
223        if chunk_estimate + estimated_staging > memory_limit && i > chunk_start {
224            chunks.push(chunk_start..i);
225            chunk_start = i;
226            chunk_estimate = 0;
227        }
228        chunk_estimate += estimated_staging;
229    }
230
231    // Final chunk (always push — handles single-chunk and trailing files)
232    if chunk_start < files.len() {
233        chunks.push(chunk_start..files.len());
234    }
235
236    if chunks.len() > 1 {
237        log::info!(
238            "Memory-bounded chunking: {} batches for {} files (limit: {} MB)",
239            chunks.len(),
240            files.len(),
241            memory_limit / (1024 * 1024),
242        );
243    }
244
245    chunks
246}
247
/// Name of the file-processing phase of a graph build.
pub const GRAPH_FILE_PROCESSING_PHASE: &str = "File processing";
250
251/// Build a unified graph from source files.
252///
253/// This function:
254/// 1. Walks the file tree starting at `root`
255/// 2. For each file, extracts symbols using the appropriate language plugin
256/// 3. Runs the 5-pass build pipeline to populate the graph
257/// 4. Returns the completed `CodeGraph`
258///
259/// # Arguments
260///
261/// * `root` - Root directory to scan for source files
262/// * `plugins` - Plugin manager for language-specific extraction
263/// * `config` - Build configuration
264///
265/// # Returns
266///
267/// A `CodeGraph` containing the populated graph.
268///
269/// # Errors
270///
271/// Returns an error if:
272/// - The root path does not exist
273/// - No graph builders are registered
274/// - All eligible files fail to build (per-file failures are logged and skipped)
275///
276/// # Example
277///
278/// ```ignore
279/// use sqry_core::graph::unified::build::{build_unified_graph, BuildConfig};
280/// use sqry_core::plugin::PluginManager;
281/// use std::path::Path;
282///
283/// let plugins = sqry_plugin_registry::create_plugin_manager();
284/// let config = BuildConfig::default();
285/// let graph = build_unified_graph(Path::new("src"), &plugins, &config)?;
286/// println!("Created graph with {} nodes", graph.node_count());
287/// ```
288pub fn build_unified_graph(
289    root: &Path,
290    plugins: &PluginManager,
291    config: &BuildConfig,
292) -> Result<CodeGraph> {
293    build_unified_graph_cancellable(root, plugins, config, &CancellationToken::default())
294        .map_err(anyhow::Error::from)
295}
296
297/// Build a unified graph from source files with progress reporting.
298///
299/// This is the same as [`build_unified_graph`] but accepts a progress reporter
300/// for tracking build progress.
301///
302/// # Arguments
303///
304/// * `root` - Root directory to scan for source files
305/// * `plugins` - Plugin manager for language-specific extraction
306/// * `config` - Build configuration
307/// * `progress` - Progress reporter for build status updates
308///
309/// # Returns
310///
311/// A `CodeGraph` containing the populated graph.
312///
313/// # Errors
314///
315/// Returns an error if the path is missing, no graph builders are registered,
316/// or all eligible files fail to build.
317pub fn build_unified_graph_with_progress(
318    root: &Path,
319    plugins: &PluginManager,
320    config: &BuildConfig,
321    progress: SharedReporter,
322) -> Result<(CodeGraph, usize)> {
323    build_unified_graph_with_progress_cancellable(
324        root,
325        plugins,
326        config,
327        progress,
328        &CancellationToken::default(),
329    )
330    .map_err(anyhow::Error::from)
331}
332
333/// Build a unified graph with cooperative cancellation.
334///
335/// Behaves identically to [`build_unified_graph`] except that the
336/// `cancellation` token is polled at every pass boundary. A cancelled
337/// token causes the pipeline to return [`GraphBuilderError::Cancelled`]
338/// at the next boundary.
339///
340/// Used by the sqryd daemon's rebuild dispatcher to abort in-flight
341/// full rebuilds when a workspace is evicted mid-build.
342///
343/// # Errors
344///
345/// Returns [`GraphBuilderError::Cancelled`] if the token is cancelled
346/// at any pass boundary; otherwise the same error modes as
347/// [`build_unified_graph`] (lifted from `anyhow::Error` into
348/// [`GraphBuilderError::Internal`]).
349pub fn build_unified_graph_cancellable(
350    root: &Path,
351    plugins: &PluginManager,
352    config: &BuildConfig,
353    cancellation: &CancellationToken,
354) -> GraphResult<CodeGraph> {
355    let (graph, _effective_threads) =
356        build_unified_graph_inner(root, plugins, config, no_op_reporter(), cancellation)?;
357    Ok(graph)
358}
359
360/// Build a unified graph with cooperative cancellation AND a progress
361/// reporter.
362///
363/// Combines [`build_unified_graph_cancellable`] + the progress
364/// reporter variant.
365///
366/// # Errors
367///
368/// Same as [`build_unified_graph_cancellable`].
369pub fn build_unified_graph_with_progress_cancellable(
370    root: &Path,
371    plugins: &PluginManager,
372    config: &BuildConfig,
373    progress: SharedReporter,
374    cancellation: &CancellationToken,
375) -> GraphResult<(CodeGraph, usize)> {
376    build_unified_graph_inner(root, plugins, config, progress, cancellation)
377}
378
379/// Internal implementation that returns the effective thread count alongside the graph.
380///
381/// Used by [`build_and_persist_graph_with_progress`] to propagate the thread count
382/// into `BuildResult` without exposing it in the public API.
383///
384/// Accepts a [`CancellationToken`] which is polled at every pass
385/// boundary. Callers that do not need cancellation pass
386/// `&CancellationToken::default()` (via the `build_unified_graph` +
387/// `build_unified_graph_with_progress` wrappers).
388#[allow(clippy::too_many_lines)] // Complex 5-pass build pipeline requires sequential flow
389fn build_unified_graph_inner(
390    root: &Path,
391    plugins: &PluginManager,
392    config: &BuildConfig,
393    progress: SharedReporter,
394    cancellation: &CancellationToken,
395) -> GraphResult<(CodeGraph, usize)> {
396    if !root.exists() {
397        return Err(GraphBuilderError::Internal {
398            reason: format!("Path {} does not exist", root.display()),
399        });
400    }
401
402    log::info!(
403        "Building unified graph from source files in {}",
404        root.display()
405    );
406
407    // 7c cancellation boundary 1: pre-build, after arg validation.
408    cancellation.check()?;
409
410    let has_graph_builders = plugins
411        .plugins()
412        .iter()
413        .any(|plugin| plugin.graph_builder().is_some());
414    if !has_graph_builders {
415        return Err(GraphBuilderError::Internal {
416            reason: "No graph builders registered – cannot build code graph".to_string(),
417        });
418    }
419
420    // Create progress tracker for this build
421    let tracker = GraphBuildProgressTracker::new(progress);
422
423    // 1. Find source files
424    let mut files = find_source_files(root, config);
425    sort_files_for_build(root, &mut files);
426
427    // 7c cancellation boundary 2: after file discovery, before thread
428    // pool creation + graph allocation.
429    cancellation.check()?;
430
431    // 2. Create the unified graph
432    let mut graph = CodeGraph::new();
433
434    // 3. Create scoped thread pool for parallel parse
435    let pool = create_thread_pool(config).map_err(|e| GraphBuilderError::Internal {
436        reason: format!("thread pool: {e}"),
437    })?;
438    let effective_threads = pool.current_num_threads();
439    log::info!("Parallel indexing: using {effective_threads} threads");
440
441    // Chunked parallel-parse / parallel-commit pipeline.
442    //
443    // Files are processed in memory-bounded batches (chunks). Each chunk:
444    //   Phase 1: Parse files in parallel (rayon thread pool)
445    //   Phase 2: Count + prefix-sum range assignment
446    //   Phase 3: Parallel commit into disjoint pre-allocated arena/interner ranges
447    //   Phase 4: After ALL chunks — string dedup, global remap, index build, edge bulk insert
448    //
449    // The batch boundary is determined by `staging_memory_limit`: once the
450    // accumulated staging buffer size exceeds the watermark, the current
451    // batch is committed before more files are parsed. This prevents OOM
452    // on large repositories where holding all StagingGraphs simultaneously
453    // would exhaust available RAM.
454    let total_files = files.len();
455    tracker.start_phase(
456        1,
457        "Chunked structural indexing (parse -> range-plan -> semantic commit)",
458        total_files,
459    );
460
461    let (mut succeeded, mut parse_errors, mut skipped, mut timed_out) =
462        (0usize, 0usize, 0usize, 0usize);
463    let mut total_staging_bytes = 0usize;
464    let mut peak_chunk_staging_bytes = 0usize;
465    let mut max_file_staging_bytes = 0usize;
466
467    // Global offsets track running positions across chunks.
468    // For a fresh graph: node arena starts at 0 slots, string interner at 1 (sentinel).
469    let initial_string_offset = graph.strings_mut().alloc_range(0).unwrap_or(1);
470    let mut offsets = GlobalOffsets {
471        node_offset: u32::try_from(graph.nodes().slot_count()).unwrap_or(0),
472        string_offset: initial_string_offset,
473    };
474    // Collect all edges across chunks for Phase 4 bulk insert.
475    let mut all_edges: Vec<Vec<PendingEdge>> = Vec::new();
476
477    let chunks = compute_parse_chunks(&files, &pool, plugins, config.staging_memory_limit);
478    for chunk_range in chunks {
479        // 7c cancellation boundary 3: top of each chunk iteration.
480        cancellation.check()?;
481
482        let chunk_files = &files[chunk_range];
483
484        // 7c test hook: observation point fired at the top of each
485        // chunk. Tests that need to flip the cancellation token
486        // between chunks register a callback here. Production builds
487        // compile this call out entirely.
488        #[cfg(any(test, feature = "rebuild-internals"))]
489        testing::fire_after_chunk_hook(cancellation);
490
491        // Phase 1: Parallel parse this chunk
492        let staged_results: Vec<(PathBuf, Result<ParsedFileOutcome>)> = pool.install(|| {
493            chunk_files
494                .par_iter()
495                .map(|path| {
496                    let result = parse_file(path.as_path(), plugins);
497                    tracker.increment_progress();
498                    (path.clone(), result)
499                })
500                .collect()
501        });
502
503        // Separate successful parses from errors/skips
504        let mut chunk_parsed: Vec<(PathBuf, ParsedFile)> = Vec::new();
505        let mut chunk_staging_bytes = 0usize;
506        for (path, result) in staged_results {
507            match result {
508                Ok(ParsedFileOutcome::Parsed(parsed)) => {
509                    let file_bytes = parsed.staging.estimated_byte_size();
510                    total_staging_bytes += file_bytes;
511                    chunk_staging_bytes += file_bytes;
512                    if file_bytes > max_file_staging_bytes {
513                        max_file_staging_bytes = file_bytes;
514                    }
515                    chunk_parsed.push((path, parsed));
516                }
517                Ok(ParsedFileOutcome::Skipped) => skipped += 1,
518                Ok(ParsedFileOutcome::TimedOut {
519                    file,
520                    phase,
521                    timeout_ms,
522                }) => {
523                    timed_out += 1;
524                    log::warn!(
525                        "Timed out building graph for {} during {} after {} ms",
526                        file.display(),
527                        phase,
528                        timeout_ms,
529                    );
530                }
531                Err(e) => {
532                    parse_errors += 1;
533                    log::warn!("Failed to parse {}: {e}", path.display());
534                }
535            }
536        }
537        if chunk_staging_bytes > peak_chunk_staging_bytes {
538            peak_chunk_staging_bytes = chunk_staging_bytes;
539        }
540
541        if chunk_parsed.is_empty() {
542            continue;
543        }
544
545        // Register files in batch
546        let file_info: Vec<_> = chunk_parsed
547            .iter()
548            .map(|(path, parsed)| (path.clone(), Some(parsed.language)))
549            .collect();
550        let file_ids = graph.files_mut().register_batch(&file_info).map_err(|e| {
551            GraphBuilderError::Internal {
552                reason: format!("Failed to register files: {e}"),
553            }
554        })?;
555
556        // Phase 2: Count + range assignment (fast, no progress needed)
557        let staging_refs: Vec<_> = chunk_parsed.iter().map(|(_, p)| &p.staging).collect();
558        let plan = phase2_assign_ranges(&staging_refs, &file_ids, &offsets);
559
560        // Pre-allocate arena and interner ranges for Phase 3.
561        let placeholder = crate::graph::unified::storage::NodeEntry::new(
562            crate::graph::unified::node::NodeKind::Other,
563            crate::graph::unified::string::StringId::new(0),
564            crate::graph::unified::file::FileId::new(0),
565        );
566        graph
567            .nodes_mut()
568            .alloc_range(plan.total_nodes, &placeholder)
569            .map_err(|e| GraphBuilderError::Internal {
570                reason: format!("Failed to alloc node range: {e:?}"),
571            })?;
572        graph
573            .strings_mut()
574            .alloc_range(plan.total_strings)
575            .map_err(|e| GraphBuilderError::Internal {
576                reason: format!("Failed to alloc string range: {e}"),
577            })?;
578
579        // Phase 3: Parallel commit into disjoint pre-allocated ranges.
580        // Use pool.install to respect BuildConfig::num_threads for rayon par_iter.
581        //
582        // `phase3_parallel_commit` is generic over
583        // `G: GraphMutationTarget` as of Task 4 Step 4 Phase 1; here
584        // the inferred `G` is `CodeGraph`, and the helper reaches the
585        // arena + interner via `graph.nodes_and_strings_mut()`
586        // internally.
587        let phase3 = pool.install(|| phase3_parallel_commit(&plan, &staging_refs, &mut graph));
588
589        // Validate written counts match plan. A mismatch indicates a bug in
590        // StagingGraph counting — abort the build to prevent phantom entries
591        // and inconsistent file registry state.
592        let expected_nodes = plan.total_nodes as usize;
593        let expected_strings = plan.total_strings as usize;
594        let expected_edges = usize::try_from(plan.total_edges)
595            .unwrap_or_else(|_| unreachable!("edge count does not fit usize"));
596        if phase3.total_nodes_written != expected_nodes
597            || phase3.total_strings_written != expected_strings
598            || phase3.total_edges_collected != expected_edges
599        {
600            return Err(GraphBuilderError::Internal {
601                reason: format!(
602                    "Phase 3 count mismatch: nodes {}/{expected_nodes}, strings {}/{expected_strings}, edges {}/{expected_edges}. This indicates a bug in StagingGraph counting.",
603                    phase3.total_nodes_written,
604                    phase3.total_strings_written,
605                    phase3.total_edges_collected,
606                ),
607            });
608        }
609
610        // Populate FileSegmentTable from the chunk's file plans.
611        for fp in &plan.file_plans {
612            let start = fp.node_range.start;
613            let count = fp.node_range.end.saturating_sub(start);
614            graph
615                .file_segments_mut()
616                .record_range(fp.file_id, start, count);
617        }
618
619        // Populate FileRegistry::per_file_nodes from Phase 3's
620        // committed-NodeId vectors. This is the Gate 0c iter-2 B2 fix
621        // (pulled base-plan Step 1 forward): each NodeId committed by
622        // parallel-parse is bucketed by its owning FileId so the
623        // bucket-bijection debug invariant at publish time can verify
624        // arena ↔ bucket consistency against real data instead of a
625        // vacuously-empty map.
626        //
627        // Iteration order matches `plan.file_plans`, which is
628        // deterministic across runs. `per_file_node_ids[i]` is the
629        // set of NodeIds committed for `plan.file_plans[i]`; the
630        // registry's `record_node` is O(1) amortised per call.
631        debug_assert_eq!(
632            phase3.per_file_node_ids.len(),
633            plan.file_plans.len(),
634            "phase3 per-file node ID vector length must match plan length"
635        );
636        for (fp, node_ids) in plan.file_plans.iter().zip(phase3.per_file_node_ids.iter()) {
637            for nid in node_ids {
638                graph.files_mut().record_node(fp.file_id, *nid);
639            }
640        }
641
642        succeeded += chunk_parsed.len();
643
644        // Merge confidence metadata from parsed files
645        for (_path, parsed) in &mut chunk_parsed {
646            if let Some(confidence) = parsed.staging.take_confidence() {
647                let language_name = parsed.language.to_string();
648                graph.merge_confidence(&language_name, confidence);
649            }
650        }
651
652        // Update global offsets for next chunk
653        offsets.node_offset += plan.total_nodes;
654        offsets.string_offset += plan.total_strings;
655
656        // 7c cancellation boundary 4: after chunk commit, before
657        // accumulating edges for Phase 4.
658        cancellation.check()?;
659
660        // Accumulate edges for Phase 4
661        all_edges.extend(phase3.per_file_edges);
662    }
663    tracker.complete_phase();
664
665    // 7c test hook: observation point fired after the chunk loop exits
666    // and before Phase 4 finalization. Tests that need to flip the
667    // cancellation token at this boundary register a callback here.
668    #[cfg(any(test, feature = "rebuild-internals"))]
669    testing::fire_before_phase4_hook(cancellation);
670
671    // Phase 4: Post-chunk finalization
672    tracker.start_phase(4, "Finalizing graph", 5);
673
674    // 7c cancellation boundary 5: pre-Phase-4a.
675    cancellation.check()?;
676
677    // Phase 4a: Global string dedup
678    let string_remap = graph.strings_mut().build_dedup_table();
679    if !string_remap.is_empty() {
680        log::debug!(
681            "Phase 4a: dedup removed {} duplicate string(s)",
682            string_remap.len()
683        );
684
685        // Phase 4b: Apply dedup remap to all nodes and pending edges
686        phase4_apply_global_remap(graph.nodes_mut(), &mut all_edges, &string_remap);
687    }
688    tracker.increment_progress(); // 4a+4b done
689
690    // 7c cancellation boundary 6: pre-Phase-4c (rebuild_indices).
691    cancellation.check()?;
692
693    // Phase 4c: Build indices from finalized arena.
694    // Uses build_from_arena() which is O(n log n) — no per-element duplicate check.
695    graph.rebuild_indices();
696    tracker.increment_progress(); // 4c done
697
698    // 7c cancellation boundary 7: pre-Phase-4c-prime
699    // (phase4c_prime_unify_cross_file_nodes).
700    cancellation.check()?;
701
702    // Phase 4c-prime: Cross-file node unification.
703    // Walk the arena for nodes sharing a qualified name and a call-compatible kind,
704    // merge duplicates into a single canonical node, and rewrite PendingEdge targets.
705    // Must run AFTER rebuild_indices (uses by_qualified_name) and BEFORE Phase 4d
706    // (operates on PendingEdge, not committed DeltaEdge).
707    let unification_stats = phase4c_prime_unify_cross_file_nodes(&mut graph, &mut all_edges);
708    if unification_stats.nodes_merged > 0 {
709        log::info!(
710            "Phase 4c-prime: unified {} duplicate nodes ({} candidate groups examined, \
711             {} edges rewritten, {} ms)",
712            unification_stats.nodes_merged,
713            unification_stats.candidate_pairs_examined,
714            unification_stats.edges_rewritten,
715            unification_stats.elapsed_ms,
716        );
717        // 7c cancellation boundary 7b: post-4c-prime, before the
718        // optional second rebuild_indices. Codex iter-0 MAJOR: without
719        // this check, a cancellation observed after the unification
720        // walk still pays another O(n log n) index rebuild.
721        cancellation.check()?;
722        // Rebuild indices after tombstoning loser nodes
723        graph.rebuild_indices();
724    }
725    tracker.increment_progress(); // 4c-prime done
726
727    // 7c cancellation boundary 8: pre-Phase-4d (bulk edge insert).
728    cancellation.check()?;
729
730    // Phase 4d: Bulk insert edges via deterministic DeltaEdge conversion.
731    // Wraps the pure pending_edges_to_delta + add_edges_bulk_ordered pair
732    // behind phase4d_bulk_insert_edges so the incremental rebuild path
733    // (Task 4 Step 4 Phase 3) can reuse the same helper against a
734    // RebuildGraph. The helper carries forward the edge store's current
735    // seq counter so non-empty graphs advance deterministically.
736    let _final_edge_seq = phase4d_bulk_insert_edges(&mut graph, &all_edges);
737    tracker.increment_progress(); // 4d done
738    tracker.complete_phase();
739
740    log::info!(
741        "Parallel indexing complete: {succeeded} committed, {skipped} skipped, \
742         {timed_out} timed out, {parse_errors} parse errors, \
743         ~{} MB total staged, ~{} MB peak chunk (max single file: ~{} KB)",
744        total_staging_bytes / (1024 * 1024),
745        peak_chunk_staging_bytes / (1024 * 1024),
746        max_file_staging_bytes / 1024,
747    );
748
749    let attempted = succeeded + parse_errors + timed_out;
750
751    if attempted == 0 {
752        log::warn!(
753            "No eligible source files found for graph build in {}",
754            root.display()
755        );
756    }
757
758    if attempted > 0 && succeeded == 0 {
759        return Err(GraphBuilderError::Internal {
760            reason: "All graph builds failed".to_string(),
761        });
762    }
763
764    // 7c cancellation boundary 9: pre-Phase-4e (binding plane).
765    cancellation.check()?;
766
767    // ------------------------------------------------------------------
768    // Phase 4e — Binding plane derivation.
769    //
770    // Runs between Phase 4d (bulk edge insert) and Pass 5 (cross-language
771    // linking). Consumes only the language-local edge kinds Contains,
772    // Defines, Imports, Exports. Populates CodeGraph::scope_arena (P2U03),
773    // CodeGraph::alias_table (P2U04), CodeGraph::shadow_table (P2U05), and
774    // CodeGraph::scope_provenance_store (P2U11) in one pass.
775    // ------------------------------------------------------------------
776    tracker.start_phase(5, "Binding plane derivation", 1);
777    let binding_stats = super::phase4e_binding::derive_binding_plane(&mut graph);
778    log::info!(
779        target: "sqry_core::build",
780        "Phase 4e: {} scopes, {} aliases, {} shadows derived",
781        binding_stats.scopes,
782        binding_stats.aliases,
783        binding_stats.shadows,
784    );
785    tracker.increment_progress();
786    tracker.complete_phase();
787
788    // 7c test hook: observation point fired before Pass 5. Tests that
789    // need to flip the cancellation token at this boundary register a
790    // callback here (fires BEFORE the check below so a hook that flips
791    // the token is observed by the subsequent check).
792    #[cfg(any(test, feature = "rebuild-internals"))]
793    testing::fire_before_pass5_hook(cancellation);
794
795    // 7c cancellation boundary 10: pre-Pass-5 (cross-language linking).
796    cancellation.check()?;
797
798    // Pass 5: Cross-language linking (FFI declarations → C/C++ functions, HTTP requests → endpoints)
799    tracker.start_phase(6, "Cross-language linking", 1);
800    let pass5_stats = super::pass5_cross_language::link_cross_language_edges(&mut graph);
801    if pass5_stats.total_edges_created > 0 {
802        log::info!(
803            "Pass 5: {} cross-language edges created ({} FFI, {} HTTP)",
804            pass5_stats.total_edges_created,
805            pass5_stats.ffi_edges_created,
806            pass5_stats.http_endpoints_matched,
807        );
808    }
809    tracker.increment_progress(); // pass 5 done
810    tracker.complete_phase();
811
812    log::info!("Built unified graph with {} nodes", graph.node_count());
813
814    // Publish-boundary invariants (A2 §F / Task 4 Gate 0d).
815    //
816    // This is the canonical "full rebuild end" call site named in plan
817    // §F.3. Full rebuilds have no tombstoned NodeIds to carry forward,
818    // so the §F.2 residue check does not run here — per plan §H step
819    // 14, the residue check has EXACTLY ONE call site
820    // (`RebuildGraph::finalize` step 14) against the drained tombstone
821    // set. Full rebuilds run the §F.1 bucket bijection only, via
822    // [`crate::graph::unified::publish::assert_publish_bijection`]:
823    // every parallel-commit chunk populates per-file buckets via
824    // `FileRegistry::record_node`, and the bijection proves no file
825    // ended up with a dead / duplicate / misfiled / missing node.
826    //
827    // In release builds the helper is a no-op; see `publish.rs`.
828    super::super::publish::assert_publish_bijection(&graph);
829
830    Ok((graph, effective_threads))
831}
832
833/// Build unified graph, persist snapshot + manifest, and run analysis pipeline.
834///
835/// Convenience wrapper that uses a no-op progress reporter.
836/// See [`build_and_persist_graph_with_progress`] for full documentation.
837///
838/// # Errors
839///
840/// Returns an error if graph building, persistence, or analysis fails.
841pub fn build_and_persist_graph(
842    root: &Path,
843    plugins: &PluginManager,
844    config: &BuildConfig,
845    build_command: &str,
846) -> Result<(CodeGraph, BuildResult)> {
847    build_and_persist_graph_with_progress(
848        root,
849        plugins,
850        config,
851        build_command,
852        inferred_plugin_selection_manifest(plugins),
853        no_op_reporter(),
854    )
855}
856
857fn inferred_plugin_selection_manifest(
858    plugins: &PluginManager,
859) -> Option<crate::graph::unified::persistence::PluginSelectionManifest> {
860    let active_plugin_ids = plugins
861        .plugins()
862        .iter()
863        .map(|plugin| plugin.metadata().id.to_string())
864        .collect::<Vec<_>>();
865    if active_plugin_ids.is_empty() {
866        return None;
867    }
868
869    Some(
870        crate::graph::unified::persistence::PluginSelectionManifest {
871            active_plugin_ids,
872            high_cost_mode: None,
873        },
874    )
875}
876
877/// Persist a pre-built graph and run the analysis pipeline.
878///
879/// This is the persist+analysis portion of
880/// [`build_and_persist_graph_with_progress`], extracted so callers can enrich
881/// the graph between build and persist.
882///
883/// # Errors
884///
885/// Returns an error if persistence or analysis fails.
886#[allow(clippy::too_many_lines, clippy::needless_pass_by_value)]
887pub fn persist_and_analyze_graph(
888    graph: CodeGraph,
889    root: &Path,
890    plugins: &PluginManager,
891    config: &BuildConfig,
892    build_command: &str,
893    plugin_selection: Option<crate::graph::unified::persistence::PluginSelectionManifest>,
894    progress: SharedReporter,
895    effective_threads: usize,
896) -> Result<(CodeGraph, BuildResult)> {
897    use crate::graph::unified::analysis::csr::CsrAdjacency;
898    use crate::graph::unified::analysis::{AnalysisIdentity, GraphAnalyses, compute_node_id_hash};
899    use crate::graph::unified::compaction::{Direction, build_compacted_csr, snapshot_edges};
900    use crate::graph::unified::persistence::manifest::write_manifest_bytes_atomic;
901    use crate::graph::unified::persistence::{
902        BuildProvenance, GraphStorage, MANIFEST_SCHEMA_VERSION, Manifest, SNAPSHOT_FORMAT_VERSION,
903        save_to_path,
904    };
905    use crate::progress::IndexProgress;
906    use chrono::Utc;
907    use sha2::{Digest, Sha256};
908
909    // Step 1: Ensure storage directories exist and remove old manifest
910    // Removing the manifest BEFORE writing the new snapshot ensures that
911    // readers see `storage.exists() == false` during the rebuild window.
912    // Without this, an interrupted rebuild (crash after snapshot write but
913    // before manifest write) would leave the old manifest paired with a
914    // new, potentially incompatible snapshot — violating the commit-point
915    // contract.
916    let storage = GraphStorage::new(root);
917    fs::create_dir_all(storage.graph_dir())
918        .with_context(|| format!("Failed to create {}", storage.graph_dir().display()))?;
919
920    if storage.exists() {
921        // Remove old manifest so readers don't see stale readiness.
922        // This MUST succeed before we overwrite the snapshot — otherwise a
923        // crash between snapshot write and manifest write leaves stale
924        // readiness (old manifest + new snapshot).  NotFound is harmless
925        // (race or already cleaned up); any other error is fatal.
926        match fs::remove_file(storage.manifest_path()) {
927            Ok(()) => {}
928            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
929            Err(e) => {
930                return Err(e).with_context(|| {
931                    format!(
932                        "Failed to remove old manifest at {} — rebuild cannot proceed safely",
933                        storage.manifest_path().display()
934                    )
935                });
936            }
937        }
938    }
939
940    // Step 2: Capture raw edge count before compaction changes it
941    let raw_edge_count = graph.edge_count();
942    let node_count = graph.node_count();
943
944    // Step 3: Compact edge stores into CSR before persistence
945    //
946    // The build pipeline inserts all edges into the DeltaBuffer (write-optimized).
947    // Without compaction, the persisted snapshot stores edges in delta, causing
948    // O(N) scans for every edges_from()/edges_to() call on load. Compacting to
949    // CSR gives O(degree) lookups — critical for kernel-scale graphs (22M edges).
950    progress.report(IndexProgress::StageStarted {
951        stage_name: "Compacting edge stores for persistence",
952    });
953    let compaction_start = std::time::Instant::now();
954
955    // Snapshot both edge stores (sequential — holds read locks briefly)
956    let forward_compaction_snapshot = {
957        let forward_store = graph.edges().forward();
958        snapshot_edges(&forward_store, node_count)
959    };
960    let reverse_compaction_snapshot = {
961        let reverse_store = graph.edges().reverse();
962        snapshot_edges(&reverse_store, node_count)
963    };
964
965    // Build both CSRs in parallel (CPU-intensive, no locks held)
966    let (forward_result, reverse_result) = rayon::join(
967        || build_compacted_csr(&forward_compaction_snapshot, Direction::Forward),
968        || build_compacted_csr(&reverse_compaction_snapshot, Direction::Reverse),
969    );
970
971    let (forward_csr, _forward_build_stats) =
972        forward_result.context("Failed to build forward CSR for persistence compaction")?;
973    let (reverse_csr, _reverse_build_stats) =
974        reverse_result.context("Failed to build reverse CSR for persistence compaction")?;
975
976    // Drop snapshots — no longer needed
977    drop(forward_compaction_snapshot);
978    drop(reverse_compaction_snapshot);
979
980    // Build analysis adjacency from forward CSR before it's consumed by swap.
981    // This replaces the expensive build_from_snapshot merge+sort (~11s on kernel).
982    let adjacency = CsrAdjacency::from_csr_graph(&forward_csr);
983
984    // Atomic mutation phase: swap both CSRs and clear both deltas
985    graph
986        .edges()
987        .swap_csrs_and_clear_deltas(forward_csr, reverse_csr);
988
989    progress.report(IndexProgress::StageCompleted {
990        stage_name: "Compacting edge stores for persistence",
991        stage_duration: compaction_start.elapsed(),
992    });
993
994    // Step 4: Save CSR-backed binary snapshot
995    progress.report(IndexProgress::SavingStarted {
996        component_name: "unified graph",
997    });
998    let save_start = std::time::Instant::now();
999
1000    save_to_path(&graph, storage.snapshot_path()).with_context(|| {
1001        format!(
1002            "Failed to save snapshot to {}",
1003            storage.snapshot_path().display()
1004        )
1005    })?;
1006
1007    progress.report(IndexProgress::SavingCompleted {
1008        component_name: "unified graph",
1009        save_duration: save_start.elapsed(),
1010    });
1011
1012    // Step 5: Compute snapshot checksum
1013    let snapshot_content =
1014        fs::read(storage.snapshot_path()).context("Failed to read snapshot for checksum")?;
1015    let snapshot_sha256 = hex::encode(Sha256::digest(&snapshot_content));
1016
1017    // Step 6: Build full analyses from the prebuilt adjacency.
1018    // CsrAdjacency was already derived from the forward CsrGraph in Step 4,
1019    // eliminating the expensive re-merge from CompactionSnapshot.
1020    progress.report(IndexProgress::StageStarted {
1021        stage_name: "Computing graph analyses",
1022    });
1023    let analysis_start = std::time::Instant::now();
1024
1025    let analyses = if let Some(thread_count) = config.num_threads {
1026        rayon::ThreadPoolBuilder::new()
1027            .num_threads(thread_count)
1028            .build()
1029            .context("Failed to create rayon thread pool for graph analysis")?
1030            .install(|| {
1031                GraphAnalyses::build_all_from_adjacency_with_budget(adjacency, &config.label_budget)
1032            })
1033    } else {
1034        GraphAnalyses::build_all_from_adjacency_with_budget(adjacency, &config.label_budget)
1035    }
1036    .context("Failed to build graph analyses")?;
1037
1038    progress.report(IndexProgress::StageCompleted {
1039        stage_name: "Computing graph analyses",
1040        stage_duration: analysis_start.elapsed(),
1041    });
1042
1043    let dedup_edge_count = analyses.adjacency.edge_count as usize;
1044
1045    let analysis_strategies = vec![
1046        AnalysisStrategySummary {
1047            edge_kind: "calls",
1048            strategy: analyses.cond_calls.strategy,
1049        },
1050        AnalysisStrategySummary {
1051            edge_kind: "imports",
1052            strategy: analyses.cond_imports.strategy,
1053        },
1054        AnalysisStrategySummary {
1055            edge_kind: "references",
1056            strategy: analyses.cond_references.strategy,
1057        },
1058        AnalysisStrategySummary {
1059            edge_kind: "inherits",
1060            strategy: analyses.cond_inherits.strategy,
1061        },
1062    ];
1063
1064    // Step 7: Count workspace files by language using plugin detection
1065    let mut file_counts: std::collections::HashMap<String, usize> =
1066        std::collections::HashMap::new();
1067    for (file_id, file_path) in graph.indexed_files() {
1068        if graph.files().is_external(file_id) {
1069            continue;
1070        }
1071        let language = plugins
1072            .plugin_for_path(file_path)
1073            .map_or_else(|| "unknown".to_string(), |p| p.metadata().id.to_string());
1074        *file_counts.entry(language).or_insert(0) += 1;
1075    }
1076    let total_files: usize = file_counts.values().sum();
1077
1078    // Step 8: Construct Manifest in memory (with dedup edge count from analysis)
1079    let built_at = Utc::now().to_rfc3339();
1080
1081    let manifest = Manifest {
1082        schema_version: MANIFEST_SCHEMA_VERSION,
1083        snapshot_format_version: SNAPSHOT_FORMAT_VERSION,
1084        built_at: built_at.clone(),
1085        root_path: root.to_string_lossy().to_string(),
1086        node_count,
1087        edge_count: dedup_edge_count,
1088        raw_edge_count: Some(raw_edge_count),
1089        snapshot_sha256,
1090        build_provenance: BuildProvenance {
1091            sqry_version: env!("CARGO_PKG_VERSION").to_string(),
1092            build_timestamp: built_at.clone(),
1093            build_command: build_command.to_string(),
1094            plugin_hashes: std::collections::HashMap::default(),
1095        },
1096        file_count: file_counts.clone(),
1097        languages: Vec::default(),
1098        config: std::collections::HashMap::default(),
1099        confidence: graph.confidence().clone(),
1100        last_indexed_commit: get_git_head_commit(root),
1101        plugin_selection: plugin_selection.clone(),
1102    };
1103
1104    // Step 9: Serialize manifest to bytes and compute hash
1105    let manifest_bytes =
1106        serde_json::to_vec_pretty(&manifest).context("Failed to serialize manifest")?;
1107
1108    let manifest_hash = {
1109        let mut hasher = Sha256::new();
1110        hasher.update(&manifest_bytes);
1111        hex::encode(hasher.finalize())
1112    };
1113
1114    // Step 10: Construct AnalysisIdentity and persist all analyses
1115    let snapshot = graph.snapshot();
1116    let node_id_hash = compute_node_id_hash(&snapshot);
1117    let identity = AnalysisIdentity::new(manifest_hash, node_id_hash);
1118
1119    fs::create_dir_all(storage.analysis_dir()).with_context(|| {
1120        format!(
1121            "Failed to create analysis directory at {}",
1122            storage.analysis_dir().display()
1123        )
1124    })?;
1125
1126    progress.report(IndexProgress::SavingStarted {
1127        component_name: "graph analyses",
1128    });
1129
1130    analyses
1131        .persist_all(&storage, &identity)
1132        .context("Failed to persist graph analyses")?;
1133
1134    log::info!(
1135        "Graph analyses persisted to {}",
1136        storage.analysis_dir().display()
1137    );
1138
1139    progress.report(IndexProgress::SavingCompleted {
1140        component_name: "graph analyses",
1141        save_duration: analysis_start.elapsed(),
1142    });
1143
1144    // Step 11: Write manifest bytes to disk LAST (commit point)
1145    write_manifest_bytes_atomic(storage.manifest_path(), &manifest_bytes).with_context(|| {
1146        format!(
1147            "Failed to save manifest to {}",
1148            storage.manifest_path().display()
1149        )
1150    })?;
1151
1152    log::info!(
1153        "Manifest saved to {} (dedup edges: {}, raw edges: {})",
1154        storage.manifest_path().display(),
1155        dedup_edge_count,
1156        raw_edge_count
1157    );
1158
1159    let build_result = BuildResult {
1160        node_count,
1161        edge_count: dedup_edge_count,
1162        raw_edge_count,
1163        file_count: file_counts,
1164        total_files,
1165        built_at,
1166        root_path: root.to_string_lossy().to_string(),
1167        thread_count: effective_threads,
1168        active_plugin_ids: plugin_selection
1169            .map_or_else(Vec::new, |selection| selection.active_plugin_ids),
1170        analysis_strategies,
1171    };
1172
1173    Ok((graph, build_result))
1174}
1175
1176/// Build unified graph with progress, persist snapshot + manifest, and run analysis.
1177///
1178/// This is the single entry point for building a complete graph index. It combines:
1179/// 1. Graph building from source files (with progress reporting)
1180/// 2. Snapshot persistence (binary format)
1181/// 3. Analysis pipeline (CSR + SCC + Condensation DAG + labels/fallback) — strict, fails on error
1182/// 4. Manifest creation with deduplicated edge count (JSON metadata, written LAST)
1183///
1184/// The manifest is the "commit point" — written last, only after all other artifacts
1185/// succeed. Consumers check `storage.exists()` (manifest-based) for index readiness.
1186///
1187/// # Arguments
1188///
1189/// * `root` - Root directory to scan for source files
1190/// * `plugins` - Plugin manager for language-specific extraction
1191/// * `config` - Build configuration
1192/// * `build_command` - Provenance string (e.g., `"cli:index"`, `"mcp:rebuild_index"`)
1193/// * `progress` - Progress reporter for build status updates
1194///
1195/// # Errors
1196///
1197/// Returns an error if graph building, persistence, or analysis fails.
1198/// Analysis failure is strict — no fallback to raw edge counts.
1199#[allow(clippy::too_many_lines, clippy::needless_pass_by_value)]
1200pub fn build_and_persist_graph_with_progress(
1201    root: &Path,
1202    plugins: &PluginManager,
1203    config: &BuildConfig,
1204    build_command: &str,
1205    plugin_selection: Option<crate::graph::unified::persistence::PluginSelectionManifest>,
1206    progress: SharedReporter,
1207) -> Result<(CodeGraph, BuildResult)> {
1208    let (graph, effective_threads) = build_unified_graph_inner(
1209        root,
1210        plugins,
1211        config,
1212        progress.clone(),
1213        &CancellationToken::default(),
1214    )
1215    .map_err(anyhow::Error::from)?;
1216    persist_and_analyze_graph(
1217        graph,
1218        root,
1219        plugins,
1220        config,
1221        build_command,
1222        plugin_selection,
1223        progress,
1224        effective_threads,
1225    )
1226}
1227
/// Resolve the `HEAD` commit of the git repository at `path`.
///
/// Runs `git -C <path> rev-parse HEAD` and returns the trimmed output only
/// when the command exits successfully and the output is exactly 40 ASCII
/// hex digits. Every other outcome (the command cannot be spawned, non-zero
/// exit, unexpected output) yields `None`.
#[must_use]
pub fn get_git_head_commit(path: &Path) -> Option<String> {
    let mut command = std::process::Command::new("git");
    command.arg("-C").arg(path).args(["rev-parse", "HEAD"]);

    let output = command.output().ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let sha = stdout.trim();
    let is_full_hex_sha = sha.len() == 40 && sha.chars().all(|c| c.is_ascii_hexdigit());

    is_full_hex_sha.then(|| sha.to_string())
}
1246
1247/// Find source files in the given directory.
1248///
1249/// Uses the `ignore` crate to respect `.gitignore` files and standard ignore patterns.
1250fn find_source_files(root: &Path, config: &BuildConfig) -> Vec<std::path::PathBuf> {
1251    let mut builder = WalkBuilder::new(root);
1252
1253    builder
1254        .follow_links(config.follow_links)
1255        .hidden(!config.include_hidden)
1256        .git_ignore(true)
1257        .git_global(true)
1258        .git_exclude(true);
1259
1260    if let Some(depth) = config.max_depth {
1261        builder.max_depth(Some(depth));
1262    }
1263
1264    if let Some(threads) = config.num_threads {
1265        builder.threads(threads);
1266    }
1267
1268    let root_for_filter = root.to_path_buf();
1269    builder.filter_entry(move |entry| {
1270        entry
1271            .file_type()
1272            .is_none_or(|file_type| !file_type.is_dir())
1273            || should_visit_source_dir(&root_for_filter, entry.path())
1274    });
1275
1276    let mut files = Vec::new();
1277
1278    for entry in builder.build() {
1279        let entry = match entry {
1280            Ok(entry) => entry,
1281            Err(err) => {
1282                log::warn!("Failed to read directory entry: {err}");
1283                continue;
1284            }
1285        };
1286
1287        if entry.file_type().is_some_and(|ft| ft.is_file()) {
1288            files.push(entry.into_path());
1289        }
1290    }
1291
1292    files
1293}
1294
1295fn should_visit_source_dir(root: &Path, path: &Path) -> bool {
1296    if path == root {
1297        return true;
1298    }
1299
1300    let Some(name) = path.file_name().and_then(|value| value.to_str()) else {
1301        return true;
1302    };
1303
1304    !is_default_excluded_source_dir(name)
1305}
1306
1307fn is_default_excluded_source_dir(name: &str) -> bool {
1308    if std::env::var("SQRY_INCLUDE_DEFAULT_EXCLUDED_DIRS")
1309        .is_ok_and(|value| value == "1" || value.eq_ignore_ascii_case("true"))
1310    {
1311        return false;
1312    }
1313
1314    DEFAULT_EXCLUDED_SOURCE_DIRS.contains(&name)
1315        || DEFAULT_EXCLUDED_SOURCE_DIR_PREFIXES
1316            .iter()
1317            .any(|prefix| name.starts_with(prefix))
1318}
1319
1320fn sort_files_for_build(root: &Path, files: &mut [PathBuf]) {
1321    let normalized_root = normalize_path_components(root);
1322    files.sort_by(|left, right| {
1323        let left_key = file_sort_key(&normalized_root, left);
1324        let right_key = file_sort_key(&normalized_root, right);
1325        left_key.cmp(&right_key).then_with(|| left.cmp(right))
1326    });
1327}
1328
1329fn file_sort_key(root: &Path, path: &Path) -> String {
1330    let normalized_path = normalize_path_components(path);
1331    let relative = normalized_path
1332        .strip_prefix(root)
1333        .unwrap_or(normalized_path.as_path());
1334    let mut key = relative.to_string_lossy().replace('\\', "/");
1335    if cfg!(windows) {
1336        key = key.to_ascii_lowercase();
1337    }
1338    key
1339}
1340
/// Result of successfully parsing a single file (parallel-safe, no shared state).
///
/// Produced by [`parse_file`] and carried inside
/// [`ParsedFileOutcome::Parsed`].
///
/// `pub(super)` so sibling modules in `crate::graph::unified::build`
/// (specifically [`super::incremental`] from Task 4 Step 4 Phase 3c onward)
/// can construct and consume `ParsedFile` values when driving the
/// parse → commit pipeline against a `RebuildGraph`. The type stays
/// crate-private: external callers still route through the higher-level
/// `build_unified_graph` / `incremental_rebuild` entrypoints.
#[derive(Debug)]
pub(super) struct ParsedFile {
    /// Language identifier for file counting and confidence merging.
    /// [`parse_file`] takes it from the plugin's graph builder.
    pub(super) language: crate::graph::Language,
    /// Staged graph operations ready for serial commit. [`parse_file`]
    /// fills it through the builder and then attaches body hashes computed
    /// from the raw file content.
    pub(super) staging: StagingGraph,
}
1356
/// Outcome of [`parse_file`]. `pub(super)` for the same reason as
/// [`ParsedFile`] — shared with [`super::incremental`]'s re-parse closure
/// driver in Phase 3c+. Still crate-private.
#[derive(Debug)]
pub(super) enum ParsedFileOutcome {
    /// The file was parsed and its graph operations were staged.
    Parsed(ParsedFile),
    /// No plugin matched the path, or the matching plugin offers no graph
    /// builder.
    Skipped,
    /// The graph builder reported `GraphBuilderError::BuildTimedOut` for
    /// this file.
    TimedOut {
        /// Path of the file whose graph build timed out.
        file: PathBuf,
        /// Phase name forwarded from the builder's timeout error.
        phase: &'static str,
        /// Timeout forwarded from the builder's timeout error
        /// (presumably milliseconds, going by the field name).
        timeout_ms: u64,
    },
}
1370
1371/// Parse a single file into a `StagingGraph` without touching the shared graph.
1372///
1373/// This function is safe to call from multiple threads — it creates its own
1374/// parser, reads the file, and builds a self-contained staging graph.
1375///
1376/// Returns [`ParsedFileOutcome::Skipped`] if the file has no matching plugin or graph builder.
1377///
1378/// `pub(super)` as of Task 4 Step 4 Phase 3c so the sibling
1379/// [`super::incremental`] module can re-parse closure files against the
1380/// rebuild-local `GraphMutationTarget` plane during `incremental_rebuild`.
1381pub(super) fn parse_file(path: &Path, plugins: &PluginManager) -> Result<ParsedFileOutcome> {
1382    let plugin = plugins.plugin_for_path(path);
1383    let Some(plugin) = plugin else {
1384        return Ok(ParsedFileOutcome::Skipped);
1385    };
1386
1387    let Some(builder) = plugin.graph_builder() else {
1388        return Ok(ParsedFileOutcome::Skipped);
1389    };
1390
1391    let reader =
1392        FileReader::open(path).with_context(|| format!("failed to read {}", path.display()))?;
1393    let raw_content = reader.as_slice();
1394
1395    let safe_parser = SafeParser::new(SafeParserConfig::new().with_max_input_size(
1396        usize::try_from(crate::config::buffers::max_source_file_size()).unwrap_or(usize::MAX),
1397    ));
1398    let prepared_content = plugin.preprocess(raw_content);
1399    let parse_content = prepared_content.as_ref();
1400    let parse_start = Instant::now();
1401    let tree = safe_parser
1402        .parse_file(&plugin.language(), parse_content, path)
1403        .map_err(|err| map_parse_error(path, err))?;
1404    let parse_duration = parse_start.elapsed();
1405    if parse_duration >= Duration::from_secs(2) {
1406        log::warn!("Slow parse ({parse_duration:.2?}): {}", path.display());
1407    }
1408
1409    let mut staging = StagingGraph::new();
1410    let build_start = Instant::now();
1411    match builder.build_graph(&tree, parse_content, path, &mut staging) {
1412        Ok(()) => {}
1413        Err(GraphBuilderError::BuildTimedOut {
1414            phase, timeout_ms, ..
1415        }) => {
1416            return Ok(ParsedFileOutcome::TimedOut {
1417                file: path.to_path_buf(),
1418                phase,
1419                timeout_ms,
1420            });
1421        }
1422        Err(err) => return Err(map_builder_error(path, &err)),
1423    }
1424    let build_duration = build_start.elapsed();
1425    if build_duration >= Duration::from_secs(2) {
1426        log::warn!(
1427            "Slow graph build ({build_duration:.2?}): {}",
1428            path.display()
1429        );
1430    }
1431
1432    staging.attach_body_hashes(raw_content);
1433
1434    Ok(ParsedFileOutcome::Parsed(ParsedFile {
1435        language: builder.language(),
1436        staging,
1437    }))
1438}
1439
1440fn map_parse_error(path: &Path, err: ParseError) -> anyhow::Error {
1441    match err {
1442        ParseError::TreeSitterFailed => {
1443            anyhow::anyhow!("tree-sitter failed to parse {}", path.display())
1444        }
1445        ParseError::LanguageSetFailed(reason) => anyhow::anyhow!(
1446            "failed to configure tree-sitter for {}: {}",
1447            path.display(),
1448            reason
1449        ),
1450        ParseError::InputTooLarge { size, max, .. } => anyhow::anyhow!(
1451            "input too large for {}: {} bytes exceeds {} byte parser limit",
1452            path.display(),
1453            size,
1454            max
1455        ),
1456        ParseError::ParseTimedOut { timeout_micros, .. } => anyhow::anyhow!(
1457            "parse timed out for {} after {} ms",
1458            path.display(),
1459            timeout_micros / 1000
1460        ),
1461        ParseError::ParseCancelled { reason, .. } => {
1462            anyhow::anyhow!("parse cancelled for {}: {}", path.display(), reason)
1463        }
1464        _ => anyhow::anyhow!("parse error in {}: {:?}", path.display(), err),
1465    }
1466}
1467
1468fn map_builder_error(path: &Path, err: &GraphBuilderError) -> anyhow::Error {
1469    anyhow::anyhow!("graph builder error in {}: {}", path.display(), err)
1470}
1471
1472// ---------------------------------------------------------------------------
1473// Test-only hooks (Task 7 Phase 7c)
1474// ---------------------------------------------------------------------------
1475//
1476// Thread-local callbacks fired at pass boundaries inside
1477// `build_unified_graph_inner`. Tests that need to flip the
1478// `CancellationToken` between chunks / before Phase 4 / before Pass 5
1479// install a hook, trigger a rebuild, and observe the pipeline
1480// short-circuit.
1481//
1482// Follows the same pattern as [`incremental::testing`] (see
1483// `incremental.rs:1605`): the module is gated on
1484// `any(test, feature = "rebuild-internals")` and production builds
1485// compile every call site into `let _ = ...;` no-ops.
1486/// Test-only hooks exposed so `sqry-daemon` integration tests can
1487/// drive cancellation-boundary scenarios in `build_unified_graph_inner`
1488/// without reaching into private module state.
1489///
1490/// Gated on `any(test, feature = "rebuild-internals")`; production
1491/// builds compile the module out.
1492#[cfg(any(test, feature = "rebuild-internals"))]
1493pub mod testing {
1494    use super::CancellationToken;
1495    use std::cell::RefCell;
1496
1497    /// Callback invoked at the top of each chunk iteration in
1498    /// `build_unified_graph_inner`, receiving the current cancellation
1499    /// token. Tests typically call `token.cancel()` after N chunks to
1500    /// assert the pipeline short-circuits at the next boundary.
1501    pub type AfterChunkHook = Box<dyn FnMut(&CancellationToken)>;
1502    /// Callback invoked once after the chunk loop exits and before
1503    /// Phase 4 finalization.
1504    pub type BeforePhase4Hook = Box<dyn FnMut(&CancellationToken)>;
1505    /// Callback invoked once before Pass 5 cross-language linking.
1506    pub type BeforePass5Hook = Box<dyn FnMut(&CancellationToken)>;
1507
1508    thread_local! {
1509        static AFTER_CHUNK_HOOK: RefCell<Option<AfterChunkHook>> = const { RefCell::new(None) };
1510        static BEFORE_PHASE4_HOOK: RefCell<Option<BeforePhase4Hook>> = const { RefCell::new(None) };
1511        static BEFORE_PASS5_HOOK: RefCell<Option<BeforePass5Hook>> = const { RefCell::new(None) };
1512    }
1513
1514    /// Install a callback that runs at the top of each chunk iteration.
1515    /// Replaces any previously-installed hook on the current thread.
1516    pub fn set_after_chunk_hook<F>(hook: F) -> Option<AfterChunkHook>
1517    where
1518        F: FnMut(&CancellationToken) + 'static,
1519    {
1520        AFTER_CHUNK_HOOK.with(|cell| cell.replace(Some(Box::new(hook))))
1521    }
1522
1523    /// Remove the currently-installed after-chunk hook. Idempotent.
1524    pub fn clear_after_chunk_hook() {
1525        AFTER_CHUNK_HOOK.with(|cell| {
1526            let _ = cell.replace(None);
1527        });
1528    }
1529
1530    /// Install a callback that runs after the chunk loop exits, before
1531    /// Phase 4 finalization. Replaces any previously-installed hook.
1532    pub fn set_before_phase4_hook<F>(hook: F) -> Option<BeforePhase4Hook>
1533    where
1534        F: FnMut(&CancellationToken) + 'static,
1535    {
1536        BEFORE_PHASE4_HOOK.with(|cell| cell.replace(Some(Box::new(hook))))
1537    }
1538
1539    /// Remove the currently-installed before-Phase-4 hook. Idempotent.
1540    pub fn clear_before_phase4_hook() {
1541        BEFORE_PHASE4_HOOK.with(|cell| {
1542            let _ = cell.replace(None);
1543        });
1544    }
1545
1546    /// Install a callback that runs before Pass 5 cross-language linking.
1547    /// Replaces any previously-installed hook.
1548    pub fn set_before_pass5_hook<F>(hook: F) -> Option<BeforePass5Hook>
1549    where
1550        F: FnMut(&CancellationToken) + 'static,
1551    {
1552        BEFORE_PASS5_HOOK.with(|cell| cell.replace(Some(Box::new(hook))))
1553    }
1554
1555    /// Remove the currently-installed before-Pass-5 hook. Idempotent.
1556    pub fn clear_before_pass5_hook() {
1557        BEFORE_PASS5_HOOK.with(|cell| {
1558            let _ = cell.replace(None);
1559        });
1560    }
1561
1562    /// Fire the installed after-chunk hook (if any). Called from
1563    /// `build_unified_graph_inner` at the top of every chunk iteration.
1564    pub(super) fn fire_after_chunk_hook(cancellation: &CancellationToken) {
1565        AFTER_CHUNK_HOOK.with(|cell| {
1566            if let Some(hook) = cell.borrow_mut().as_mut() {
1567                hook(cancellation);
1568            }
1569        });
1570    }
1571
1572    /// Fire the installed before-Phase-4 hook (if any).
1573    pub(super) fn fire_before_phase4_hook(cancellation: &CancellationToken) {
1574        BEFORE_PHASE4_HOOK.with(|cell| {
1575            if let Some(hook) = cell.borrow_mut().as_mut() {
1576                hook(cancellation);
1577            }
1578        });
1579    }
1580
1581    /// Fire the installed before-Pass-5 hook (if any).
1582    pub(super) fn fire_before_pass5_hook(cancellation: &CancellationToken) {
1583        BEFORE_PASS5_HOOK.with(|cell| {
1584            if let Some(hook) = cell.borrow_mut().as_mut() {
1585                hook(cancellation);
1586            }
1587        });
1588    }
1589
1590    /// RAII guard that installs an after-chunk hook on construction
1591    /// and clears it on drop. Prevents a panic mid-test from leaking
1592    /// a hook into a sibling test on the same thread.
1593    pub struct AfterChunkHookGuard {
1594        _sealed: (),
1595    }
1596
1597    impl AfterChunkHookGuard {
1598        /// Install `hook` as the thread-local after-chunk callback.
1599        pub fn install<F>(hook: F) -> Self
1600        where
1601            F: FnMut(&CancellationToken) + 'static,
1602        {
1603            let _previous = set_after_chunk_hook(hook);
1604            Self { _sealed: () }
1605        }
1606    }
1607
1608    impl Drop for AfterChunkHookGuard {
1609        fn drop(&mut self) {
1610            clear_after_chunk_hook();
1611        }
1612    }
1613
1614    /// RAII guard that installs a before-Phase-4 hook on construction
1615    /// and clears it on drop.
1616    pub struct BeforePhase4HookGuard {
1617        _sealed: (),
1618    }
1619
1620    impl BeforePhase4HookGuard {
1621        /// Install `hook` as the thread-local before-Phase-4 callback.
1622        pub fn install<F>(hook: F) -> Self
1623        where
1624            F: FnMut(&CancellationToken) + 'static,
1625        {
1626            let _previous = set_before_phase4_hook(hook);
1627            Self { _sealed: () }
1628        }
1629    }
1630
1631    impl Drop for BeforePhase4HookGuard {
1632        fn drop(&mut self) {
1633            clear_before_phase4_hook();
1634        }
1635    }
1636
1637    /// RAII guard that installs a before-Pass-5 hook on construction
1638    /// and clears it on drop.
1639    pub struct BeforePass5HookGuard {
1640        _sealed: (),
1641    }
1642
1643    impl BeforePass5HookGuard {
1644        /// Install `hook` as the thread-local before-Pass-5 callback.
1645        pub fn install<F>(hook: F) -> Self
1646        where
1647            F: FnMut(&CancellationToken) + 'static,
1648        {
1649            let _previous = set_before_pass5_hook(hook);
1650            Self { _sealed: () }
1651        }
1652    }
1653
1654    impl Drop for BeforePass5HookGuard {
1655        fn drop(&mut self) {
1656            clear_before_pass5_hook();
1657        }
1658    }
1659}
1660
1661#[cfg(test)]
1662mod tests {
1663    use super::*;
1664    use crate::ast::Scope;
1665    use crate::graph::{GraphBuilder, GraphBuilderError, GraphResult, Language};
1666    use crate::plugin::error::{ParseError, ScopeError};
1667    use crate::plugin::{LanguageMetadata, LanguagePlugin};
1668    use serial_test::serial;
1669    use std::fs;
1670    use std::path::{Path, PathBuf};
1671    use tempfile::TempDir;
1672    use tree_sitter::{Parser, Tree};
1673
1674    const RUST_TEST_EXTENSIONS: &[&str] = &["rs"];
1675    const FILENAME_MATCH_EXTENSIONS: &[&str] = &["rmd", "bash_profile"];
1676
1677    /// Test helper: commit a single parsed file to a graph using the serial path.
1678    ///
1679    /// This is only used in tests to verify parse-and-commit without running the
1680    /// full parallel pipeline. It replicates the old `commit_staged_file` logic.
1681    fn commit_parsed_file_for_test(path: &Path, mut parsed: ParsedFile, graph: &mut CodeGraph) {
1682        let file_id = graph
1683            .files_mut()
1684            .register_with_language(path, Some(parsed.language))
1685            .expect("register file");
1686        parsed.staging.apply_file_id(file_id);
1687        let string_remap = parsed
1688            .staging
1689            .commit_strings(graph.strings_mut())
1690            .expect("commit strings");
1691        parsed
1692            .staging
1693            .apply_string_remap(&string_remap)
1694            .expect("apply string remap");
1695        let node_id_mapping = parsed
1696            .staging
1697            .commit_nodes(graph.nodes_mut())
1698            .expect("commit nodes");
1699        let edges = parsed.staging.get_remapped_edges(&node_id_mapping);
1700        for edge in edges {
1701            graph.edges_mut().add_edge_with_spans(
1702                edge.source,
1703                edge.target,
1704                edge.kind.clone(),
1705                file_id,
1706                edge.spans.clone(),
1707            );
1708        }
1709    }
1710
1711    fn expect_parsed_file(outcome: ParsedFileOutcome) -> ParsedFile {
1712        match outcome {
1713            ParsedFileOutcome::Parsed(parsed) => parsed,
1714            ParsedFileOutcome::Skipped => panic!("expected parsed file, got skipped outcome"),
1715            ParsedFileOutcome::TimedOut { file, phase, .. } => {
1716                panic!(
1717                    "expected parsed file, got timeout outcome for {} during {}",
1718                    file.display(),
1719                    phase,
1720                )
1721            }
1722        }
1723    }
1724
1725    fn parse_rust_ast(content: &[u8]) -> Result<Tree, ParseError> {
1726        let mut parser = Parser::new();
1727        let language = tree_sitter_rust::LANGUAGE.into();
1728        parser
1729            .set_language(&language)
1730            .map_err(|err| ParseError::LanguageSetFailed(err.to_string()))?;
1731        parser
1732            .parse(content, None)
1733            .ok_or(ParseError::TreeSitterFailed)
1734    }
1735
1736    struct TestPlugin {
1737        metadata: LanguageMetadata,
1738        extensions: &'static [&'static str],
1739        builder: Option<Box<dyn GraphBuilder>>,
1740    }
1741
1742    impl TestPlugin {
1743        fn new(
1744            id: &'static str,
1745            extensions: &'static [&'static str],
1746            builder: Option<Box<dyn GraphBuilder>>,
1747        ) -> Self {
1748            Self {
1749                metadata: LanguageMetadata {
1750                    id,
1751                    name: "Rust",
1752                    version: "test",
1753                    author: "sqry-core tests",
1754                    description: "Test-only Rust plugin for unified graph entrypoint tests",
1755                    tree_sitter_version: "0.25",
1756                },
1757                extensions,
1758                builder,
1759            }
1760        }
1761    }
1762
1763    impl LanguagePlugin for TestPlugin {
1764        fn metadata(&self) -> LanguageMetadata {
1765            self.metadata.clone()
1766        }
1767
1768        fn extensions(&self) -> &'static [&'static str] {
1769            self.extensions
1770        }
1771
1772        fn language(&self) -> tree_sitter::Language {
1773            tree_sitter_rust::LANGUAGE.into()
1774        }
1775
1776        fn parse_ast(&self, content: &[u8]) -> Result<Tree, ParseError> {
1777            parse_rust_ast(content)
1778        }
1779
1780        fn extract_scopes(
1781            &self,
1782            _tree: &Tree,
1783            _content: &[u8],
1784            _file_path: &Path,
1785        ) -> Result<Vec<Scope>, ScopeError> {
1786            Ok(Vec::new())
1787        }
1788
1789        fn graph_builder(&self) -> Option<&dyn crate::graph::GraphBuilder> {
1790            self.builder.as_deref()
1791        }
1792    }
1793
1794    struct FailingGraphBuilder;
1795
1796    impl GraphBuilder for FailingGraphBuilder {
1797        fn build_graph(
1798            &self,
1799            _tree: &Tree,
1800            _content: &[u8],
1801            _file: &Path,
1802            _staging: &mut StagingGraph,
1803        ) -> GraphResult<()> {
1804            Err(GraphBuilderError::CrossLanguageError {
1805                reason: "forced failure".to_string(),
1806            })
1807        }
1808
1809        fn language(&self) -> Language {
1810            Language::Rust
1811        }
1812    }
1813
1814    struct NoopGraphBuilder;
1815
1816    impl GraphBuilder for NoopGraphBuilder {
1817        fn build_graph(
1818            &self,
1819            _tree: &Tree,
1820            _content: &[u8],
1821            _file: &Path,
1822            _staging: &mut StagingGraph,
1823        ) -> GraphResult<()> {
1824            Ok(())
1825        }
1826
1827        fn language(&self) -> Language {
1828            Language::Rust
1829        }
1830    }
1831
1832    struct TimeoutGraphBuilder;
1833
1834    impl GraphBuilder for TimeoutGraphBuilder {
1835        fn build_graph(
1836            &self,
1837            _tree: &Tree,
1838            _content: &[u8],
1839            file: &Path,
1840            _staging: &mut StagingGraph,
1841        ) -> GraphResult<()> {
1842            Err(GraphBuilderError::BuildTimedOut {
1843                file: file.to_path_buf(),
1844                phase: "test-timeout",
1845                timeout_ms: 42,
1846            })
1847        }
1848
1849        fn language(&self) -> Language {
1850            Language::Rust
1851        }
1852    }
1853
1854    struct SelectiveTimeoutGraphBuilder;
1855
1856    impl GraphBuilder for SelectiveTimeoutGraphBuilder {
1857        fn build_graph(
1858            &self,
1859            _tree: &Tree,
1860            _content: &[u8],
1861            file: &Path,
1862            staging: &mut StagingGraph,
1863        ) -> GraphResult<()> {
1864            use crate::graph::unified::build::helper::GraphBuildHelper;
1865
1866            let mut helper = GraphBuildHelper::new(staging, file, Language::Rust);
1867            let file_name = file
1868                .file_name()
1869                .and_then(|value| value.to_str())
1870                .unwrap_or_default();
1871
1872            if file_name == "timeout.rs" {
1873                helper.add_function("timeout_partial", None, false, false);
1874                return Err(GraphBuilderError::BuildTimedOut {
1875                    file: file.to_path_buf(),
1876                    phase: "test-timeout",
1877                    timeout_ms: 42,
1878                });
1879            }
1880
1881            helper.add_function("survivor_fn", None, false, false);
1882            Ok(())
1883        }
1884
1885        fn language(&self) -> Language {
1886            Language::Rust
1887        }
1888    }
1889
1890    #[test]
1891    fn test_build_config_default() {
1892        let config = BuildConfig::default();
1893        assert_eq!(config.max_depth, None);
1894        assert!(!config.follow_links);
1895        assert!(!config.include_hidden);
1896        assert_eq!(config.num_threads, None);
1897    }
1898
1899    #[test]
1900    #[serial]
1901    fn test_find_source_files_excludes_generated_dependency_roots() {
1902        let temp_dir = TempDir::new().expect("temp dir");
1903        let root = temp_dir.path();
1904
1905        fs::write(root.join("src.rs"), "fn src() {}").expect("write source file");
1906        for dir in [
1907            "_work",
1908            "_actions",
1909            "_update",
1910            "externals.2.334.0",
1911            "node_modules",
1912            "target",
1913            "vendor",
1914        ] {
1915            let nested = root.join(dir).join("nested");
1916            fs::create_dir_all(&nested).expect("create excluded dir");
1917            fs::write(nested.join("ignored.rs"), "fn ignored() {}")
1918                .expect("write ignored source file");
1919        }
1920        for dir in ["external_tools", "vendorized"] {
1921            let nested = root.join(dir).join("nested");
1922            fs::create_dir_all(&nested).expect("create included sibling dir");
1923            fs::write(nested.join("included.rs"), "fn included() {}")
1924                .expect("write included source file");
1925        }
1926
1927        let config = BuildConfig::default();
1928        let mut relative_files: Vec<_> = find_source_files(root, &config)
1929            .iter()
1930            .map(|path| path.strip_prefix(root).expect("strip root").to_path_buf())
1931            .collect();
1932        relative_files.sort();
1933
1934        assert_eq!(
1935            relative_files,
1936            vec![
1937                PathBuf::from("external_tools/nested/included.rs"),
1938                PathBuf::from("src.rs"),
1939                PathBuf::from("vendorized/nested/included.rs"),
1940            ]
1941        );
1942    }
1943
1944    #[test]
1945    #[serial]
1946    fn test_find_source_files_can_include_default_excluded_roots() {
1947        let temp_dir = TempDir::new().expect("temp dir");
1948        let root = temp_dir.path();
1949        let nested = root.join("vendor").join("first_party");
1950        fs::create_dir_all(&nested).expect("create vendor dir");
1951        fs::write(nested.join("included.rs"), "fn included() {}").expect("write included source");
1952
1953        unsafe {
1954            std::env::set_var("SQRY_INCLUDE_DEFAULT_EXCLUDED_DIRS", "1");
1955        }
1956        let config = BuildConfig::default();
1957        let files = find_source_files(root, &config);
1958        unsafe {
1959            std::env::remove_var("SQRY_INCLUDE_DEFAULT_EXCLUDED_DIRS");
1960        }
1961
1962        let relative_files: Vec<_> = files
1963            .iter()
1964            .map(|path| path.strip_prefix(root).expect("strip root").to_path_buf())
1965            .collect();
1966
1967        assert_eq!(
1968            relative_files,
1969            vec![PathBuf::from("vendor/first_party/included.rs")]
1970        );
1971    }
1972
1973    #[test]
1974    fn test_build_unified_graph_empty_registry_error() {
1975        let plugins = PluginManager::new();
1976        let config = BuildConfig::default();
1977        let root = std::path::Path::new(".");
1978
1979        let result = build_unified_graph(root, &plugins, &config);
1980        let err = result.expect_err("empty registry must error");
1981        // Task 7 Phase 7c: the internal pipeline now returns
1982        // `GraphBuilderError::Internal { reason }` instead of a bare
1983        // `anyhow::bail!`. The legacy `build_unified_graph` wrapper
1984        // lifts through `anyhow::Error::from`, which prefixes the
1985        // reason with the `GraphBuilderError::Internal` `Display`
1986        // string (`Internal graph builder error: ...`).
1987        assert_eq!(
1988            err.to_string(),
1989            "Internal graph builder error: No graph builders registered – cannot build code graph"
1990        );
1991    }
1992
1993    #[test]
1994    fn test_build_unified_graph_no_graph_builders_error() {
1995        let mut plugins = PluginManager::new();
1996        plugins.register_builtin(Box::new(TestPlugin::new(
1997            "rust-no-graph-builder",
1998            RUST_TEST_EXTENSIONS,
1999            None,
2000        )));
2001        let config = BuildConfig::default();
2002        let root = std::path::Path::new(".");
2003
2004        let result = build_unified_graph(root, &plugins, &config);
2005        let err = result.expect_err("no graph builders must error");
2006        assert_eq!(
2007            err.to_string(),
2008            "Internal graph builder error: No graph builders registered – cannot build code graph"
2009        );
2010    }
2011
2012    #[test]
2013    fn test_build_unified_graph_all_failures_error() {
2014        let temp_dir = TempDir::new().expect("temp dir");
2015        let file_path = temp_dir.path().join("fail.rs");
2016        fs::write(&file_path, "fn main() {}").expect("write test file");
2017
2018        let mut plugins = PluginManager::new();
2019        plugins.register_builtin(Box::new(TestPlugin::new(
2020            "rust-failing-graph-builder",
2021            RUST_TEST_EXTENSIONS,
2022            Some(Box::new(FailingGraphBuilder)),
2023        )));
2024        let config = BuildConfig::default();
2025
2026        let result = build_unified_graph(temp_dir.path(), &plugins, &config);
2027        let err = result.expect_err("all-failures must error");
2028        assert_eq!(
2029            err.to_string(),
2030            "Internal graph builder error: All graph builds failed"
2031        );
2032    }
2033
2034    #[test]
2035    fn test_parse_file_matches_uppercase_extension() {
2036        let temp_dir = TempDir::new().expect("temp dir");
2037        let file_path = temp_dir.path().join("report.Rmd");
2038        fs::write(&file_path, "fn main() {}").expect("write test file");
2039
2040        let mut plugins = PluginManager::new();
2041        plugins.register_builtin(Box::new(TestPlugin::new(
2042            "rust-filename-match",
2043            FILENAME_MATCH_EXTENSIONS,
2044            Some(Box::new(NoopGraphBuilder)),
2045        )));
2046        let mut graph = CodeGraph::new();
2047
2048        let parsed = expect_parsed_file(parse_file(&file_path, &plugins).expect("parse file"));
2049        commit_parsed_file_for_test(&file_path, parsed, &mut graph);
2050    }
2051
2052    #[test]
2053    fn test_parse_file_matches_dotless_filename() {
2054        let temp_dir = TempDir::new().expect("temp dir");
2055        let file_path = temp_dir.path().join("bash_profile");
2056        fs::write(&file_path, "fn main() {}").expect("write test file");
2057
2058        let mut plugins = PluginManager::new();
2059        plugins.register_builtin(Box::new(TestPlugin::new(
2060            "rust-filename-match",
2061            FILENAME_MATCH_EXTENSIONS,
2062            Some(Box::new(NoopGraphBuilder)),
2063        )));
2064        let mut graph = CodeGraph::new();
2065
2066        let parsed = expect_parsed_file(parse_file(&file_path, &plugins).expect("parse file"));
2067        commit_parsed_file_for_test(&file_path, parsed, &mut graph);
2068    }
2069
2070    #[test]
2071    fn test_parse_file_matches_pulumi_stack_filename() {
2072        let temp_dir = TempDir::new().expect("temp dir");
2073        let file_path = temp_dir.path().join("Pulumi.dev.yaml");
2074        fs::write(&file_path, "fn main() {}").expect("write test file");
2075
2076        let mut plugins = PluginManager::new();
2077        plugins.register_builtin(Box::new(TestPlugin::new(
2078            "pulumi",
2079            &["pulumi.yaml"],
2080            Some(Box::new(NoopGraphBuilder)),
2081        )));
2082        let mut graph = CodeGraph::new();
2083
2084        let parsed = expect_parsed_file(parse_file(&file_path, &plugins).expect("parse file"));
2085        commit_parsed_file_for_test(&file_path, parsed, &mut graph);
2086    }
2087
2088    #[test]
2089    fn test_parse_file_returns_timed_out_outcome() {
2090        let temp_dir = TempDir::new().expect("temp dir");
2091        let file_path = temp_dir.path().join("timeout.rs");
2092        fs::write(&file_path, "fn main() {}").expect("write test file");
2093
2094        let mut plugins = PluginManager::new();
2095        plugins.register_builtin(Box::new(TestPlugin::new(
2096            "rust-timeout",
2097            RUST_TEST_EXTENSIONS,
2098            Some(Box::new(TimeoutGraphBuilder)),
2099        )));
2100
2101        let outcome = parse_file(&file_path, &plugins).expect("parse file");
2102        match outcome {
2103            ParsedFileOutcome::TimedOut {
2104                file,
2105                phase,
2106                timeout_ms,
2107            } => {
2108                assert_eq!(file, file_path);
2109                assert_eq!(phase, "test-timeout");
2110                assert_eq!(timeout_ms, 42);
2111            }
2112            other => panic!("expected timed out outcome, got {other:?}"),
2113        }
2114    }
2115
2116    #[test]
2117    fn test_parse_file_rejects_oversized_input() {
2118        let temp_dir = TempDir::new().expect("temp dir");
2119        let file_path = temp_dir.path().join("oversized.rs");
2120        fs::write(&file_path, vec![b'a'; 1_048_577]).expect("write oversized file");
2121
2122        let mut plugins = PluginManager::new();
2123        plugins.register_builtin(Box::new(TestPlugin::new(
2124            "rust-oversized",
2125            RUST_TEST_EXTENSIONS,
2126            Some(Box::new(NoopGraphBuilder)),
2127        )));
2128
2129        unsafe {
2130            std::env::set_var("SQRY_MAX_SOURCE_FILE_SIZE", "1048576");
2131        }
2132        let err = parse_file(&file_path, &plugins).expect_err("oversized file should fail");
2133        unsafe {
2134            std::env::remove_var("SQRY_MAX_SOURCE_FILE_SIZE");
2135        }
2136
2137        let err_text = err.to_string();
2138        assert!(err_text.contains("oversized.rs"));
2139    }
2140
2141    #[test]
2142    fn test_build_unified_graph_skips_timed_out_file_without_partial_commit() {
2143        let temp_dir = TempDir::new().expect("temp dir");
2144        let ok_path = temp_dir.path().join("ok.rs");
2145        let timeout_path = temp_dir.path().join("timeout.rs");
2146        fs::write(&ok_path, "fn ok() {}").expect("write ok file");
2147        fs::write(&timeout_path, "fn timeout() {}").expect("write timeout file");
2148
2149        let mut plugins = PluginManager::new();
2150        plugins.register_builtin(Box::new(TestPlugin::new(
2151            "rust-selective-timeout",
2152            RUST_TEST_EXTENSIONS,
2153            Some(Box::new(SelectiveTimeoutGraphBuilder)),
2154        )));
2155        let config = BuildConfig::default();
2156
2157        let graph = build_unified_graph(temp_dir.path(), &plugins, &config)
2158            .expect("graph build should succeed with surviving files");
2159        let snapshot = graph.snapshot();
2160
2161        assert_eq!(snapshot.find_by_pattern("survivor_fn").len(), 1);
2162        assert!(
2163            snapshot.find_by_pattern("timeout_partial").is_empty(),
2164            "timed out file staging must not be committed"
2165        );
2166    }
2167
2168    // ========================================================================
2169    // Build pipeline consolidation regression tests
2170    // ========================================================================
2171
2172    /// A graph builder that creates a few nodes and edges for testing.
2173    struct SimpleGraphBuilder;
2174
2175    impl GraphBuilder for SimpleGraphBuilder {
2176        fn build_graph(
2177            &self,
2178            _tree: &Tree,
2179            _content: &[u8],
2180            file: &Path,
2181            staging: &mut StagingGraph,
2182        ) -> GraphResult<()> {
2183            use crate::graph::unified::build::helper::GraphBuildHelper;
2184
2185            let mut helper = GraphBuildHelper::new(staging, file, Language::Rust);
2186
2187            // Create two function nodes
2188            let fn1 = helper.add_function("main", None, false, false);
2189            let fn2 = helper.add_function("helper", None, false, false);
2190
2191            // Add a Calls edge from main -> helper
2192            helper.add_call_edge(fn1, fn2);
2193
2194            Ok(())
2195        }
2196
2197        fn language(&self) -> Language {
2198            Language::Rust
2199        }
2200    }
2201
2202    /// `build_and_persist_graph` returns a populated `BuildResult`.
2203    #[test]
2204    fn test_build_and_persist_graph_returns_build_result() {
2205        let temp_dir = TempDir::new().expect("temp dir");
2206        let file_path = temp_dir.path().join("test.rs");
2207        fs::write(&file_path, "fn main() {} fn helper() {}").expect("write test file");
2208
2209        let mut plugins = PluginManager::new();
2210        plugins.register_builtin(Box::new(TestPlugin::new(
2211            "rust-simple",
2212            RUST_TEST_EXTENSIONS,
2213            Some(Box::new(SimpleGraphBuilder)),
2214        )));
2215        let config = BuildConfig::default();
2216
2217        let result =
2218            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:build_result");
2219        assert!(result.is_ok(), "build_and_persist_graph should succeed");
2220
2221        let (_graph, build_result) = result.unwrap();
2222        assert!(build_result.node_count > 0, "Should have nodes");
2223        assert!(build_result.total_files > 0, "Should have indexed files");
2224        assert!(!build_result.built_at.is_empty(), "Should have timestamp");
2225        assert!(!build_result.root_path.is_empty(), "Should have root path");
2226    }
2227
2228    /// Deduplicated `edge_count` is always <= `raw_edge_count`.
2229    #[test]
2230    fn test_build_result_edge_count_le_raw() {
2231        let temp_dir = TempDir::new().expect("temp dir");
2232        let file_path = temp_dir.path().join("test.rs");
2233        fs::write(&file_path, "fn main() {} fn helper() {}").expect("write test file");
2234
2235        let mut plugins = PluginManager::new();
2236        plugins.register_builtin(Box::new(TestPlugin::new(
2237            "rust-simple",
2238            RUST_TEST_EXTENSIONS,
2239            Some(Box::new(SimpleGraphBuilder)),
2240        )));
2241        let config = BuildConfig::default();
2242
2243        let (_graph, build_result) =
2244            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:edge_count").unwrap();
2245
2246        assert!(
2247            build_result.edge_count <= build_result.raw_edge_count,
2248            "Deduplicated edge count ({}) should be <= raw edge count ({})",
2249            build_result.edge_count,
2250            build_result.raw_edge_count
2251        );
2252    }
2253
2254    /// File counts use plugin detection (keyed by plugin ID).
2255    #[test]
2256    fn test_build_and_persist_graph_file_counts_use_plugins() {
2257        let temp_dir = TempDir::new().expect("temp dir");
2258        let file_path = temp_dir.path().join("test.rs");
2259        fs::write(&file_path, "fn main() {}").expect("write test file");
2260
2261        let mut plugins = PluginManager::new();
2262        plugins.register_builtin(Box::new(TestPlugin::new(
2263            "rust-simple",
2264            RUST_TEST_EXTENSIONS,
2265            Some(Box::new(SimpleGraphBuilder)),
2266        )));
2267        let config = BuildConfig::default();
2268
2269        let (_graph, build_result) =
2270            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:file_counts")
2271                .unwrap();
2272
2273        // File counts should include the plugin's ID as the language key
2274        assert!(
2275            !build_result.file_count.is_empty(),
2276            "File counts should not be empty"
2277        );
2278        assert!(
2279            build_result.file_count.contains_key("rust-simple"),
2280            "File counts should use plugin ID. Got: {:?}",
2281            build_result.file_count
2282        );
2283    }
2284
2285    /// Manifest `edge_count` matches `BuildResult` (deduplicated).
2286    #[test]
2287    fn test_manifest_edge_count_is_deduplicated() {
2288        use crate::graph::unified::persistence::GraphStorage;
2289
2290        let temp_dir = TempDir::new().expect("temp dir");
2291        let file_path = temp_dir.path().join("test.rs");
2292        fs::write(&file_path, "fn main() {} fn helper() {}").expect("write test file");
2293
2294        let mut plugins = PluginManager::new();
2295        plugins.register_builtin(Box::new(TestPlugin::new(
2296            "rust-simple",
2297            RUST_TEST_EXTENSIONS,
2298            Some(Box::new(SimpleGraphBuilder)),
2299        )));
2300        let config = BuildConfig::default();
2301
2302        let (_graph, build_result) =
2303            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:manifest_dedup")
2304                .unwrap();
2305
2306        // Load manifest and verify edge counts match BuildResult
2307        let storage = GraphStorage::new(temp_dir.path());
2308        assert!(storage.exists(), "Manifest should exist after build");
2309
2310        let manifest = storage.load_manifest().unwrap();
2311        assert_eq!(
2312            manifest.edge_count, build_result.edge_count,
2313            "Manifest edge_count should match BuildResult (deduplicated)"
2314        );
2315        assert_eq!(
2316            manifest.raw_edge_count,
2317            Some(build_result.raw_edge_count),
2318            "Manifest raw_edge_count should match BuildResult"
2319        );
2320    }
2321
2322    /// Build command provenance is recorded in the manifest.
2323    #[test]
2324    fn test_build_command_provenance() {
2325        use crate::graph::unified::persistence::GraphStorage;
2326
2327        let temp_dir = TempDir::new().expect("temp dir");
2328        let file_path = temp_dir.path().join("test.rs");
2329        fs::write(&file_path, "fn main() {}").expect("write test file");
2330
2331        let mut plugins = PluginManager::new();
2332        plugins.register_builtin(Box::new(TestPlugin::new(
2333            "rust-simple",
2334            RUST_TEST_EXTENSIONS,
2335            Some(Box::new(SimpleGraphBuilder)),
2336        )));
2337        let config = BuildConfig::default();
2338
2339        build_and_persist_graph(temp_dir.path(), &plugins, &config, "cli:index").unwrap();
2340
2341        let storage = GraphStorage::new(temp_dir.path());
2342        let manifest = storage.load_manifest().unwrap();
2343        assert_eq!(
2344            manifest.build_provenance.build_command, "cli:index",
2345            "Build command provenance should match"
2346        );
2347    }
2348
2349    /// Wrapper-based builds infer plugin-selection provenance from the active
2350    /// plugin manager so non-CLI callers do not silently persist legacy-looking
2351    /// manifests.
2352    #[test]
2353    fn test_wrapper_infers_plugin_selection_from_manager() {
2354        use crate::graph::unified::persistence::GraphStorage;
2355
2356        let temp_dir = TempDir::new().expect("temp dir");
2357        let file_path = temp_dir.path().join("test.rs");
2358        fs::write(&file_path, "fn main() {}").expect("write test file");
2359
2360        let mut plugins = PluginManager::new();
2361        plugins.register_builtin(Box::new(TestPlugin::new(
2362            "rust-simple",
2363            RUST_TEST_EXTENSIONS,
2364            Some(Box::new(SimpleGraphBuilder)),
2365        )));
2366        let config = BuildConfig::default();
2367
2368        let (_graph, build_result) =
2369            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:wrapper_plugins")
2370                .expect("wrapper build should succeed");
2371
2372        assert_eq!(
2373            build_result.active_plugin_ids,
2374            vec!["rust-simple".to_string()],
2375            "build result should expose the inferred active plugin ids"
2376        );
2377
2378        let storage = GraphStorage::new(temp_dir.path());
2379        let manifest = storage.load_manifest().expect("manifest should load");
2380        let plugin_selection = manifest
2381            .plugin_selection
2382            .expect("wrapper should persist plugin selection metadata");
2383        assert_eq!(
2384            plugin_selection.active_plugin_ids,
2385            vec!["rust-simple".to_string()],
2386            "wrapper should persist the manager-derived plugin ids"
2387        );
2388        assert_eq!(
2389            plugin_selection.high_cost_mode, None,
2390            "wrapper-inferred plugin selection should keep high_cost_mode diagnostic-only"
2391        );
2392    }
2393
2394    /// Analysis identity hash matches the on-disk manifest bytes hash.
2395    #[test]
2396    fn test_analysis_identity_matches_manifest_hash() {
2397        use crate::graph::unified::analysis::persistence::load_csr;
2398        use crate::graph::unified::persistence::GraphStorage;
2399        use sha2::{Digest, Sha256};
2400
2401        let temp_dir = TempDir::new().expect("temp dir");
2402        let file_path = temp_dir.path().join("test.rs");
2403        fs::write(&file_path, "fn main() {} fn helper() {}").expect("write test file");
2404
2405        let mut plugins = PluginManager::new();
2406        plugins.register_builtin(Box::new(TestPlugin::new(
2407            "rust-simple",
2408            RUST_TEST_EXTENSIONS,
2409            Some(Box::new(SimpleGraphBuilder)),
2410        )));
2411        let config = BuildConfig::default();
2412
2413        build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:identity").unwrap();
2414
2415        let storage = GraphStorage::new(temp_dir.path());
2416
2417        // Compute manifest hash from on-disk manifest bytes
2418        let manifest_bytes = std::fs::read(storage.manifest_path()).unwrap();
2419        let expected_hash = hex::encode(Sha256::digest(&manifest_bytes));
2420
2421        // Load analysis identity from the CSR file (identity is embedded in each analysis file)
2422        let (_csr, identity) = load_csr(&storage.analysis_csr_path()).unwrap();
2423
2424        assert_eq!(
2425            identity.manifest_hash, expected_hash,
2426            "On-disk manifest hash should equal analysis identity hash"
2427        );
2428    }
2429
2430    /// Regression test: old manifest is removed at start of rebuild.
2431    ///
2432    /// Verifies that `build_and_persist_graph_with_progress()` removes any
2433    /// existing manifest before writing the new snapshot. This prevents the
2434    /// inconsistent state where an old manifest pairs with a new snapshot
2435    /// after an interrupted rebuild.
2436    #[test]
2437    fn test_old_manifest_removed_during_rebuild() {
2438        use crate::graph::unified::persistence::GraphStorage;
2439
2440        let temp_dir = tempfile::TempDir::new().unwrap();
2441        let src = temp_dir.path().join("lib.rs");
2442        std::fs::write(&src, "fn main() {}").unwrap();
2443
2444        // Build an initial index
2445        let mut plugins = PluginManager::new();
2446        plugins.register_builtin(Box::new(TestPlugin::new(
2447            "rust-simple",
2448            RUST_TEST_EXTENSIONS,
2449            Some(Box::new(SimpleGraphBuilder)),
2450        )));
2451        let config = BuildConfig::default();
2452        build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:initial").unwrap();
2453
2454        let storage = GraphStorage::new(temp_dir.path());
2455        assert!(
2456            storage.exists(),
2457            "Manifest should exist after initial build"
2458        );
2459
2460        // Record the original manifest's built_at timestamp
2461        let original_manifest = storage.load_manifest().unwrap();
2462        let original_built_at = original_manifest.built_at.clone();
2463
2464        // Rebuild — during the build, the old manifest should be removed first
2465        build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:rebuild").unwrap();
2466
2467        // Verify the manifest was replaced (different built_at timestamp)
2468        let new_manifest = storage.load_manifest().unwrap();
2469        assert_ne!(
2470            original_built_at, new_manifest.built_at,
2471            "Manifest should have been replaced with new timestamp"
2472        );
2473        assert_eq!(
2474            new_manifest.build_provenance.build_command, "test:rebuild",
2475            "Manifest should reflect the rebuild provenance"
2476        );
2477    }
2478
2479    /// Regression test: failed rebuild leaves index in non-ready state.
2480    ///
2481    /// Exercises the real pipeline by making the analysis directory
2482    /// non-writable after an initial build, then attempting a rebuild.
2483    /// The pipeline should:
2484    ///   1. Remove the old manifest (Step 2) — making `exists()` false.
2485    ///   2. Write the new snapshot (Step 3).
2486    ///   3. Fail at analysis persistence (Step 9) because the directory
2487    ///      is not writable.
2488    ///   4. Return an error — manifest is NEVER written.
2489    ///
2490    /// After the failed rebuild, `storage.exists()` must be false (old
2491    /// manifest removed), even though the snapshot file was updated.
2492    #[test]
2493    fn test_failed_rebuild_leaves_index_not_ready() {
2494        use crate::graph::unified::persistence::GraphStorage;
2495
2496        let temp_dir = tempfile::TempDir::new().unwrap();
2497        let src = temp_dir.path().join("lib.rs");
2498        std::fs::write(&src, "fn main() {}").unwrap();
2499
2500        // Build an initial index (success)
2501        let mut plugins = PluginManager::new();
2502        plugins.register_builtin(Box::new(TestPlugin::new(
2503            "rust-simple",
2504            RUST_TEST_EXTENSIONS,
2505            Some(Box::new(SimpleGraphBuilder)),
2506        )));
2507        let config = BuildConfig::default();
2508        build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:initial").unwrap();
2509
2510        let storage = GraphStorage::new(temp_dir.path());
2511        assert!(
2512            storage.exists(),
2513            "Manifest should exist after initial build"
2514        );
2515
2516        // Replace the analysis directory with a regular file to force a
2517        // failure at Step 9 (analysis persistence). `create_dir_all` will
2518        // fail because a regular file exists where a directory is expected.
2519        // This simulates the real failure window between snapshot write
2520        // (Step 3) and manifest write (Step 10).
2521        let analysis_dir = storage.analysis_dir().to_path_buf();
2522        std::fs::remove_dir_all(&analysis_dir).unwrap();
2523        std::fs::write(&analysis_dir, b"blocker").unwrap();
2524
2525        // Attempt rebuild — should fail at analysis persistence
2526        let result =
2527            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:failed_rebuild");
2528
2529        // Restore analysis dir so TempDir cleanup succeeds
2530        std::fs::remove_file(&analysis_dir).unwrap();
2531        std::fs::create_dir_all(&analysis_dir).unwrap();
2532
2533        // The build should have failed
2534        assert!(
2535            result.is_err(),
2536            "Rebuild should fail when analysis dir is read-only"
2537        );
2538
2539        // The old manifest should have been removed (Step 2 ran before failure)
2540        assert!(
2541            !storage.exists(),
2542            "After failed rebuild, manifest should have been removed — index is NOT ready"
2543        );
2544
2545        // The snapshot was updated (Step 3 succeeded before failure)
2546        assert!(
2547            storage.snapshot_exists(),
2548            "Snapshot should still exist on disk (written before failure)"
2549        );
2550    }
2551
2552    // ===== CSR Compaction Persistence Regression Tests =====
2553
2554    /// Graph builder that creates duplicate edges to exercise `raw_edge_count` > `edge_count`.
2555    struct DuplicateCallsGraphBuilder;
2556
2557    impl GraphBuilder for DuplicateCallsGraphBuilder {
2558        fn build_graph(
2559            &self,
2560            _tree: &Tree,
2561            _content: &[u8],
2562            file: &Path,
2563            staging: &mut StagingGraph,
2564        ) -> GraphResult<()> {
2565            use crate::graph::unified::build::helper::GraphBuildHelper;
2566
2567            let mut helper = GraphBuildHelper::new(staging, file, Language::Rust);
2568            let fn1 = helper.add_function("main", None, false, false);
2569            let fn2 = helper.add_function("helper", None, false, false);
2570
2571            // Add the same Calls edge twice to create a duplicate
2572            helper.add_call_edge(fn1, fn2);
2573            helper.add_call_edge(fn1, fn2);
2574
2575            Ok(())
2576        }
2577
2578        fn language(&self) -> Language {
2579            Language::Rust
2580        }
2581    }
2582
2583    /// Persisted snapshot has CSR on both stores and empty deltas.
2584    #[test]
2585    fn test_persisted_snapshot_compacts_both_edge_stores_before_save() {
2586        use crate::graph::unified::persistence::{GraphStorage, load_from_path};
2587
2588        let temp_dir = TempDir::new().expect("temp dir");
2589        let file_path = temp_dir.path().join("test.rs");
2590        fs::write(&file_path, "fn main() {} fn helper() {}").expect("write test file");
2591
2592        let mut plugins = PluginManager::new();
2593        plugins.register_builtin(Box::new(TestPlugin::new(
2594            "rust-simple",
2595            RUST_TEST_EXTENSIONS,
2596            Some(Box::new(SimpleGraphBuilder)),
2597        )));
2598        let config = BuildConfig::default();
2599
2600        let _result =
2601            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:csr_compact")
2602                .expect("build should succeed");
2603
2604        // Load the persisted snapshot and verify CSR state
2605        let storage = GraphStorage::new(temp_dir.path());
2606        let loaded = load_from_path(storage.snapshot_path(), None).expect("load should succeed");
2607
2608        assert!(
2609            loaded.edges().forward().csr().is_some(),
2610            "Forward store must have CSR after persistence"
2611        );
2612        assert!(
2613            loaded.edges().reverse().csr().is_some(),
2614            "Reverse store must have CSR after persistence"
2615        );
2616
2617        let stats = loaded.edges().stats();
2618        assert_eq!(
2619            stats.forward.delta_edge_count, 0,
2620            "Forward delta must be empty after persistence"
2621        );
2622        assert_eq!(
2623            stats.reverse.delta_edge_count, 0,
2624            "Reverse delta must be empty after persistence"
2625        );
2626    }
2627
2628    /// Loaded snapshot supports reverse traversal (direct-callers / `edges_to`).
2629    #[test]
2630    fn test_loaded_snapshot_edges_to_works_after_round_trip() {
2631        use crate::graph::unified::edge::EdgeKind;
2632        use crate::graph::unified::persistence::{GraphStorage, load_from_path};
2633        use crate::graph::unified::{
2634            FileScope, ResolutionMode, SymbolCandidateOutcome, SymbolQuery,
2635        };
2636
2637        let temp_dir = TempDir::new().expect("temp dir");
2638        let file_path = temp_dir.path().join("test.rs");
2639        fs::write(&file_path, "fn main() {} fn helper() {}").expect("write test file");
2640
2641        let mut plugins = PluginManager::new();
2642        plugins.register_builtin(Box::new(TestPlugin::new(
2643            "rust-simple",
2644            RUST_TEST_EXTENSIONS,
2645            Some(Box::new(SimpleGraphBuilder)),
2646        )));
2647        let config = BuildConfig::default();
2648
2649        build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:round_trip")
2650            .expect("build should succeed");
2651
2652        let storage = GraphStorage::new(temp_dir.path());
2653        let loaded = load_from_path(storage.snapshot_path(), None).expect("load should succeed");
2654
2655        // Find main and helper node IDs through symbol resolution
2656        let snapshot = loaded.snapshot();
2657
2658        let main_id = match snapshot.find_symbol_candidates(&SymbolQuery {
2659            symbol: "main",
2660            file_scope: FileScope::Any,
2661            mode: ResolutionMode::AllowSuffixCandidates,
2662        }) {
2663            SymbolCandidateOutcome::Candidates(ids) => ids[0],
2664            _ => panic!("main node must exist"),
2665        };
2666
2667        let helper_id = match snapshot.find_symbol_candidates(&SymbolQuery {
2668            symbol: "helper",
2669            file_scope: FileScope::Any,
2670            mode: ResolutionMode::AllowSuffixCandidates,
2671        }) {
2672            SymbolCandidateOutcome::Candidates(ids) => ids[0],
2673            _ => panic!("helper node must exist"),
2674        };
2675
2676        // Forward: main -> helper
2677        let forward_edges = loaded.edges().edges_from(main_id);
2678        let has_call = forward_edges
2679            .iter()
2680            .any(|e| e.target == helper_id && matches!(e.kind, EdgeKind::Calls { .. }));
2681        assert!(has_call, "Forward traversal: main should call helper");
2682
2683        // Reverse: helper <- main (the critical regression check)
2684        let reverse_edges = loaded.edges().edges_to(helper_id);
2685        let has_caller = reverse_edges
2686            .iter()
2687            .any(|e| e.source == main_id && matches!(e.kind, EdgeKind::Calls { .. }));
2688        assert!(
2689            has_caller,
2690            "Reverse traversal: helper should have main as caller"
2691        );
2692    }
2693
2694    /// `raw_edge_count` >= `edge_count` still holds after pre-save compaction.
2695    #[test]
2696    fn test_raw_edge_count_preserved_across_pre_save_compaction() {
2697        use crate::graph::unified::persistence::GraphStorage;
2698
2699        let temp_dir = TempDir::new().expect("temp dir");
2700        let file_path = temp_dir.path().join("test.rs");
2701        fs::write(&file_path, "fn main() {} fn helper() {}").expect("write test file");
2702
2703        let mut plugins = PluginManager::new();
2704        plugins.register_builtin(Box::new(TestPlugin::new(
2705            "rust-dup",
2706            RUST_TEST_EXTENSIONS,
2707            Some(Box::new(DuplicateCallsGraphBuilder)),
2708        )));
2709        let config = BuildConfig::default();
2710
2711        let (_graph, build_result) =
2712            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:raw_edge_count")
2713                .expect("build should succeed");
2714
2715        assert!(
2716            build_result.raw_edge_count > build_result.edge_count,
2717            "raw_edge_count ({}) must be > edge_count ({}) for duplicate builder",
2718            build_result.raw_edge_count,
2719            build_result.edge_count
2720        );
2721
2722        // Verify manifest matches
2723        let storage = GraphStorage::new(temp_dir.path());
2724        let manifest = storage.load_manifest().expect("manifest should load");
2725
2726        assert_eq!(
2727            manifest.raw_edge_count,
2728            Some(build_result.raw_edge_count),
2729            "Manifest raw_edge_count must match build result"
2730        );
2731        assert_eq!(
2732            manifest.edge_count, build_result.edge_count,
2733            "Manifest edge_count must match build result"
2734        );
2735    }
2736
2737    /// Full round-trip: build -> save -> load -> query produces correct results.
2738    #[test]
2739    fn test_build_save_load_query_round_trip_preserves_edge_queries() {
2740        use crate::graph::unified::edge::EdgeKind;
2741        use crate::graph::unified::persistence::{GraphStorage, load_from_path};
2742        use crate::graph::unified::{
2743            FileScope, ResolutionMode, SymbolCandidateOutcome, SymbolQuery,
2744        };
2745
2746        let temp_dir = TempDir::new().expect("temp dir");
2747        let file_path = temp_dir.path().join("test.rs");
2748        fs::write(&file_path, "fn main() {} fn helper() {}").expect("write test file");
2749
2750        let mut plugins = PluginManager::new();
2751        plugins.register_builtin(Box::new(TestPlugin::new(
2752            "rust-simple",
2753            RUST_TEST_EXTENSIONS,
2754            Some(Box::new(SimpleGraphBuilder)),
2755        )));
2756        let config = BuildConfig::default();
2757
2758        let (_original_graph, build_result) =
2759            build_and_persist_graph(temp_dir.path(), &plugins, &config, "test:full_round_trip")
2760                .expect("build should succeed");
2761
2762        // Load from disk
2763        let storage = GraphStorage::new(temp_dir.path());
2764        let loaded = load_from_path(storage.snapshot_path(), None).expect("load should succeed");
2765
2766        // Edge count on loaded graph should match dedup count
2767        assert_eq!(
2768            loaded.edge_count(),
2769            build_result.edge_count,
2770            "Loaded graph edge count must match build result dedup count"
2771        );
2772
2773        // Node count should match
2774        assert_eq!(
2775            loaded.node_count(),
2776            build_result.node_count,
2777            "Loaded graph node count must match build result"
2778        );
2779
2780        // Verify edge queries work on loaded graph
2781        let snapshot = loaded.snapshot();
2782
2783        let main_id = match snapshot.find_symbol_candidates(&SymbolQuery {
2784            symbol: "main",
2785            file_scope: FileScope::Any,
2786            mode: ResolutionMode::AllowSuffixCandidates,
2787        }) {
2788            SymbolCandidateOutcome::Candidates(ids) => {
2789                assert!(!ids.is_empty(), "main must exist");
2790                ids[0]
2791            }
2792            _ => panic!("main node must exist"),
2793        };
2794
2795        let helper_id = match snapshot.find_symbol_candidates(&SymbolQuery {
2796            symbol: "helper",
2797            file_scope: FileScope::Any,
2798            mode: ResolutionMode::AllowSuffixCandidates,
2799        }) {
2800            SymbolCandidateOutcome::Candidates(ids) => {
2801                assert!(!ids.is_empty(), "helper must exist");
2802                ids[0]
2803            }
2804            _ => panic!("helper node must exist"),
2805        };
2806
2807        // Forward query: main calls helper
2808        let fwd = loaded.edges().edges_from(main_id);
2809        let has_fwd_call = fwd
2810            .iter()
2811            .any(|e| e.target == helper_id && matches!(e.kind, EdgeKind::Calls { .. }));
2812        assert!(has_fwd_call, "edges_from(main) must include call to helper");
2813
2814        // Reverse query: helper called by main
2815        let rev = loaded.edges().edges_to(helper_id);
2816        let has_rev_call = rev
2817            .iter()
2818            .any(|e| e.source == main_id && matches!(e.kind, EdgeKind::Calls { .. }));
2819        assert!(has_rev_call, "edges_to(helper) must include caller main");
2820    }
2821
2822    // -----------------------------------------------------------------
2823    // Phase 7c cancellation wire-through tests (task 7 phase 7c)
2824    // -----------------------------------------------------------------
2825    //
2826    // The four cancellation-boundary tests below exercise the pipeline
2827    // at distinct points in `build_unified_graph_inner`:
2828    //
2829    //   1. preflight — token cancelled before the first boundary; no
2830    //      FS walk, no parse, no Phase 4 work.
2831    //   2. mid-chunk — token flipped after the first chunk commits via
2832    //      the AfterChunkHookGuard; second chunk never parses.
2833    //   3. pre-Phase-4 — token flipped after the chunk loop exits via
2834    //      the BeforePhase4HookGuard; Phase 4a+ never runs.
2835    //   4. pre-Pass-5 — token flipped before cross-language linking
2836    //      via the BeforePass5HookGuard; Pass 5 never runs.
2837    //
2838    // A fifth test confirms the backwards-compatible default path
2839    // (no cancellation arg) still returns a fully-built graph.
2840
2841    fn build_rust_test_fixture(dir: &Path, file_count: usize) {
2842        for i in 0..file_count {
2843            let path = dir.join(format!("fixture_{i}.rs"));
2844            fs::write(&path, format!("pub fn fn_{i}() {{ let _ = {i}; }}")).expect("write fixture");
2845        }
2846    }
2847
2848    fn make_rust_test_plugins() -> PluginManager {
2849        let mut plugins = PluginManager::new();
2850        plugins.register_builtin(Box::new(TestPlugin::new(
2851            "rust-noop-for-cancellation-tests",
2852            RUST_TEST_EXTENSIONS,
2853            Some(Box::new(NoopGraphBuilder)),
2854        )));
2855        plugins
2856    }
2857
2858    #[test]
2859    fn build_unified_graph_cancellable_preflight_cancellation_returns_cancelled() {
2860        let tmp = TempDir::new().expect("tmp");
2861        build_rust_test_fixture(tmp.path(), 4);
2862        let plugins = make_rust_test_plugins();
2863        let config = BuildConfig::default();
2864
2865        let cancel = CancellationToken::new();
2866        cancel.cancel();
2867
2868        let result = build_unified_graph_cancellable(tmp.path(), &plugins, &config, &cancel);
2869        let err = result.expect_err("pre-cancelled token must short-circuit");
2870        assert!(
2871            matches!(err, GraphBuilderError::Cancelled),
2872            "expected Cancelled, got: {err:?}"
2873        );
2874    }
2875
2876    #[test]
2877    fn build_unified_graph_cancellable_mid_chunk_cancellation_returns_cancelled() {
2878        let tmp = TempDir::new().expect("tmp");
2879        // Force multiple chunks by setting a tiny staging_memory_limit.
2880        build_rust_test_fixture(tmp.path(), 8);
2881        let plugins = make_rust_test_plugins();
2882        // A very small memory limit forces ~1 file per chunk.
2883        let config = BuildConfig {
2884            staging_memory_limit: 1,
2885            ..BuildConfig::default()
2886        };
2887
2888        let cancel = CancellationToken::new();
2889
2890        // Install a hook that cancels after the FIRST chunk. The hook
2891        // fires at the TOP of every chunk iteration (including chunk 0
2892        // before cancelling). We cancel on the first call; the next
2893        // iteration's top-of-loop `cancellation.check()` short-circuits.
2894        let cancel_for_hook = cancel.clone();
2895        let mut call_count = 0u32;
2896        let _guard = testing::AfterChunkHookGuard::install(move |tok| {
2897            call_count += 1;
2898            if call_count >= 2 {
2899                cancel_for_hook.cancel();
2900                // `tok` is the same shared Arc under the hood.
2901                assert!(tok.is_cancelled());
2902            }
2903        });
2904
2905        let result = build_unified_graph_cancellable(tmp.path(), &plugins, &config, &cancel);
2906        let err = result.expect_err("mid-chunk cancellation must short-circuit");
2907        assert!(
2908            matches!(err, GraphBuilderError::Cancelled),
2909            "expected Cancelled, got: {err:?}"
2910        );
2911    }
2912
2913    #[test]
2914    fn build_unified_graph_cancellable_pre_phase4_cancellation_short_circuits() {
2915        let tmp = TempDir::new().expect("tmp");
2916        build_rust_test_fixture(tmp.path(), 4);
2917        let plugins = make_rust_test_plugins();
2918        let config = BuildConfig::default();
2919
2920        let cancel = CancellationToken::new();
2921        let cancel_for_hook = cancel.clone();
2922        let _guard = testing::BeforePhase4HookGuard::install(move |_tok| {
2923            cancel_for_hook.cancel();
2924        });
2925
2926        let result = build_unified_graph_cancellable(tmp.path(), &plugins, &config, &cancel);
2927        let err = result.expect_err("pre-Phase-4 cancellation must short-circuit");
2928        assert!(
2929            matches!(err, GraphBuilderError::Cancelled),
2930            "expected Cancelled, got: {err:?}"
2931        );
2932    }
2933
2934    #[test]
2935    fn build_unified_graph_cancellable_pre_pass5_cancellation_short_circuits() {
2936        let tmp = TempDir::new().expect("tmp");
2937        build_rust_test_fixture(tmp.path(), 4);
2938        let plugins = make_rust_test_plugins();
2939        let config = BuildConfig::default();
2940
2941        let cancel = CancellationToken::new();
2942        let cancel_for_hook = cancel.clone();
2943        let _guard = testing::BeforePass5HookGuard::install(move |_tok| {
2944            cancel_for_hook.cancel();
2945        });
2946
2947        let result = build_unified_graph_cancellable(tmp.path(), &plugins, &config, &cancel);
2948        let err = result.expect_err("pre-Pass-5 cancellation must short-circuit");
2949        assert!(
2950            matches!(err, GraphBuilderError::Cancelled),
2951            "expected Cancelled, got: {err:?}"
2952        );
2953    }
2954
2955    #[test]
2956    fn build_unified_graph_default_path_is_backwards_compatible() {
2957        let tmp = TempDir::new().expect("tmp");
2958        build_rust_test_fixture(tmp.path(), 3);
2959        let plugins = make_rust_test_plugins();
2960        let config = BuildConfig::default();
2961
2962        // Legacy API: no cancellation parameter. Must return a
2963        // built graph without triggering cancellation short-circuits.
2964        // (The test plugin is a NoopGraphBuilder that produces zero
2965        // nodes; we only assert the success path returns Ok.)
2966        let _graph = build_unified_graph(tmp.path(), &plugins, &config)
2967            .expect("legacy path must still build successfully");
2968    }
2969}