Skip to main content

sqry_cli/commands/
analyze.rs

1//! Analyze command implementation
2//!
3//! Builds precomputed graph analyses (Pass 5) for fast query-time performance.
4
5use crate::args::Cli;
6use crate::commands::graph::loader::{GraphLoadConfig, load_unified_graph_for_cli, no_op_reporter};
7use crate::index_discovery::find_nearest_index;
8use crate::output::OutputStreams;
9use anyhow::{Context, Result};
10use serde::Serialize;
11use sqry_core::graph::unified::analysis::{
12    AnalysisIdentity, GraphAnalyses, compute_manifest_hash, compute_node_id_hash,
13    resolve_label_budget_config,
14};
15use sqry_core::graph::unified::compaction::snapshot_edges;
16use sqry_core::graph::unified::persistence::GraphStorage;
17use std::time::Instant;
18
19/// Analysis statistics for output
20#[derive(Debug, Serialize)]
21struct AnalysisStats {
22    /// Total nodes in graph
23    node_count: u32,
24    /// Total edges in graph
25    edge_count: u32,
26    /// SCC statistics per edge kind
27    scc_stats: Vec<SccStats>,
28    /// Analysis build time in seconds
29    build_time_secs: f64,
30}
31
32#[derive(Debug, Serialize)]
33struct SccStats {
34    edge_kind: String,
35    scc_count: u32,
36    non_trivial_count: u32,
37    max_scc_size: u32,
38}
39
40fn has_fresh_analysis(storage: &GraphStorage) -> bool {
41    let manifest_hash = compute_manifest_hash(storage.manifest_path()).ok();
42    manifest_hash.is_some_and(|hash| {
43        ["calls", "imports", "references", "inherits"]
44            .iter()
45            .all(|kind| {
46                let scc_path = storage.analysis_scc_path(kind);
47                let cond_path = storage.analysis_cond_path(kind);
48                scc_path.exists()
49                    && cond_path.exists()
50                    && sqry_core::graph::unified::analysis::persistence::load_scc_manifest_checked(
51                        &scc_path, &hash,
52                    )
53                    .is_ok()
54                    && sqry_core::graph::unified::analysis::persistence::load_condensation_manifest_checked(
55                        &cond_path, &hash,
56                    )
57                    .is_ok()
58            })
59    })
60}
61
62fn collect_analysis_stats(
63    analyses: &GraphAnalyses,
64    build_time: std::time::Duration,
65) -> AnalysisStats {
66    AnalysisStats {
67        node_count: analyses.adjacency.node_count,
68        edge_count: analyses.adjacency.edge_count,
69        scc_stats: vec![
70            SccStats {
71                edge_kind: "calls".to_string(),
72                scc_count: analyses.scc_calls.scc_count,
73                non_trivial_count: analyses.scc_calls.non_trivial_count,
74                max_scc_size: analyses.scc_calls.max_scc_size,
75            },
76            SccStats {
77                edge_kind: "imports".to_string(),
78                scc_count: analyses.scc_imports.scc_count,
79                non_trivial_count: analyses.scc_imports.non_trivial_count,
80                max_scc_size: analyses.scc_imports.max_scc_size,
81            },
82            SccStats {
83                edge_kind: "references".to_string(),
84                scc_count: analyses.scc_references.scc_count,
85                non_trivial_count: analyses.scc_references.non_trivial_count,
86                max_scc_size: analyses.scc_references.max_scc_size,
87            },
88            SccStats {
89                edge_kind: "inherits".to_string(),
90                scc_count: analyses.scc_inherits.scc_count,
91                non_trivial_count: analyses.scc_inherits.non_trivial_count,
92                max_scc_size: analyses.scc_inherits.max_scc_size,
93            },
94        ],
95        build_time_secs: build_time.as_secs_f64(),
96    }
97}
98
99/// Run the analyze command.
100///
101/// Builds precomputed graph analyses (CSR, SCC, Condensation DAG, 2-hop labels)
102/// and persists them to .sqry/analysis/ for fast query-time performance.
103///
104/// Analysis settings are resolved with precedence: CLI args > config file > env vars > compiled defaults.
105///
106/// # Errors
107/// Returns an error if the graph cannot be loaded or analyses cannot be built.
108#[allow(clippy::too_many_arguments)]
109#[allow(clippy::too_many_lines)] // Sequential CLI orchestration across index discovery, graph loading, analysis execution, persistence, and user-facing output.
110pub fn run_analyze(
111    cli: &Cli,
112    path: Option<&str>,
113    force: bool,
114    threads: Option<usize>,
115    label_budget: Option<u64>,
116    density_threshold: Option<u64>,
117    budget_exceeded_policy: Option<&str>,
118    no_labels: bool,
119) -> Result<()> {
120    let mut streams = OutputStreams::new();
121
122    // Find index
123    let search_path = path.map_or_else(
124        || std::env::current_dir().unwrap_or_default(),
125        std::path::PathBuf::from,
126    );
127
128    let index_location = find_nearest_index(&search_path);
129    let Some(ref loc) = index_location else {
130        streams
131            .write_diagnostic("No .sqry-index found. Run 'sqry index' first to build the index.")?;
132        return Ok(());
133    };
134
135    streams.write_diagnostic("Building graph analyses...")?;
136
137    // Load unified graph
138    let config = GraphLoadConfig::default();
139    let graph = load_unified_graph_for_cli(&loc.index_root, &config, cli, no_op_reporter())
140        .context("Failed to load graph. Run 'sqry index' to build the graph.")?;
141
142    // Check if full analysis artifacts already exist AND match the current manifest.
143    // After `sqry index --force`, stale SCC/DAG files from a previous analysis may
144    // remain on disk. We validate against the manifest hash to detect this.
145    let storage = GraphStorage::new(&loc.index_root);
146    let analysis_dir = storage.analysis_dir();
147    if !force && has_fresh_analysis(&storage) {
148        streams.write_diagnostic(
149            "Analysis files already exist and match current index. Use --force to rebuild.",
150        )?;
151        return Ok(());
152    }
153
154    // Resolve analysis settings: CLI args > config file > env vars > compiled defaults
155    let label_budget_config = resolve_label_budget_config(
156        &loc.index_root,
157        label_budget,
158        density_threshold,
159        budget_exceeded_policy,
160        no_labels,
161    )
162    .context("Failed to resolve analysis budget configuration")?;
163
164    // Build compaction snapshot from graph
165    streams.write_diagnostic("Creating compaction snapshot...")?;
166    let graph_snapshot = graph.snapshot();
167    let edges = graph_snapshot.edges();
168    let forward_store = edges.forward();
169    let node_count = graph_snapshot.nodes().len();
170    let snapshot = snapshot_edges(&forward_store, node_count);
171
172    let manifest_hash = compute_manifest_hash(storage.manifest_path())
173        .context("Failed to compute manifest hash for analysis identity")?;
174    let node_id_hash = compute_node_id_hash(&graph_snapshot);
175    let identity = AnalysisIdentity::new(manifest_hash, node_id_hash);
176
177    // Build all analyses
178    let phase_desc = if label_budget_config.skip_labels {
179        "CSR + SCC + Condensation (labels skipped)"
180    } else {
181        "CSR + SCC + Condensation + 2-hop labels"
182    };
183    streams.write_diagnostic(&format!("Computing analyses ({phase_desc})..."))?;
184    let start = Instant::now();
185    let analyses = if let Some(n) = threads {
186        let pool = rayon::ThreadPoolBuilder::new()
187            .num_threads(n)
188            .build()
189            .context("Failed to create rayon thread pool for analysis")?;
190        pool.install(|| GraphAnalyses::build_all_with_budget(&snapshot, &label_budget_config))
191            .context("Failed to build graph analyses")?
192    } else {
193        GraphAnalyses::build_all_with_budget(&snapshot, &label_budget_config)
194            .context("Failed to build graph analyses")?
195    };
196    let build_time = start.elapsed();
197
198    // Persist to disk
199    streams.write_diagnostic("Persisting analyses to disk...")?;
200    analyses
201        .persist_all(&storage, &identity)
202        .context("Failed to persist analyses")?;
203
204    let stats = collect_analysis_stats(&analyses, build_time);
205
206    // Output
207    if cli.json {
208        let json = serde_json::to_string_pretty(&stats).context("Failed to serialize to JSON")?;
209        streams.write_result(&json)?;
210    } else {
211        let output = format_stats_text(&stats, analysis_dir);
212        streams.write_result(&output)?;
213    }
214
215    Ok(())
216}
217
218/// Format analysis statistics as human-readable text
219fn format_stats_text(stats: &AnalysisStats, analysis_dir: &std::path::Path) -> String {
220    let mut lines = Vec::new();
221
222    lines.push("✓ Graph analysis complete".to_string());
223    lines.push(String::new());
224
225    lines.push(format!(
226        "Graph: {} nodes, {} edges",
227        stats.node_count, stats.edge_count
228    ));
229    lines.push(format!("Build time: {:.2}s", stats.build_time_secs));
230    lines.push(String::new());
231
232    lines.push("SCC Analysis:".to_string());
233    for scc_stat in &stats.scc_stats {
234        lines.push(format!(
235            "  {}: {} SCCs ({} non-trivial, max size: {})",
236            scc_stat.edge_kind,
237            scc_stat.scc_count,
238            scc_stat.non_trivial_count,
239            scc_stat.max_scc_size
240        ));
241    }
242    lines.push(String::new());
243
244    lines.push(format!(
245        "Analysis files written to: {}",
246        analysis_dir.display()
247    ));
248    lines.push("  - adjacency.csr (CSR adjacency matrix)".to_string());
249    lines.push(
250        "  - scc_calls.scc, scc_imports.scc, scc_references.scc, scc_inherits.scc".to_string(),
251    );
252    lines.push(
253        "  - cond_calls.dag, cond_imports.dag, cond_references.dag, cond_inherits.dag".to_string(),
254    );
255
256    lines.join("\n")
257}
258
#[cfg(test)]
mod tests {
    use super::*;
    use sqry_core::graph::unified::analysis::condensation::{
        CondensationDag, ReachabilityStrategy,
    };
    use sqry_core::graph::unified::analysis::csr::CsrAdjacency;
    use sqry_core::graph::unified::analysis::persistence::{
        AnalysisIdentity, persist_condensation, persist_scc,
    };
    use sqry_core::graph::unified::analysis::scc::SccData;
    use sqry_core::graph::unified::edge::{EdgeKind, ResolvedVia};
    use sqry_core::graph::unified::persistence::GraphStorage;
    use std::time::Duration;

    /// Manifest body shared by the freshness tests; only its hash matters.
    const MANIFEST_JSON: &str = r#"{"version":"1.0"}"#;

    /// A `Calls` edge kind with neutral payload values.
    fn calls_kind() -> EdgeKind {
        EdgeKind::Calls {
            argument_count: 0,
            is_async: false,
            resolved_via: ResolvedVia::Direct,
        }
    }

    /// An `Imports` edge kind with neutral payload values.
    fn imports_kind() -> EdgeKind {
        EdgeKind::Imports {
            alias: None,
            is_wildcard: false,
        }
    }

    /// Create a minimal `SccData` for a given edge kind.
    ///
    /// Only the summary counters depend on `scc_count`; the membership arrays
    /// are fixed placeholders describing ten nodes in a single component.
    fn make_scc(edge_kind: EdgeKind, scc_count: u32) -> SccData {
        SccData {
            edge_kind,
            node_count: 10,
            scc_count,
            non_trivial_count: u32::from(scc_count > 1),
            max_scc_size: if scc_count > 1 { 3 } else { 1 },
            node_to_scc: vec![0; 10],
            scc_offsets: vec![0, 10],
            scc_members: (0..10).collect(),
            has_self_loop: vec![false],
        }
    }

    /// Create an `SccData` describing an empty graph for a given edge kind.
    fn make_empty_scc(edge_kind: EdgeKind) -> SccData {
        SccData {
            edge_kind,
            node_count: 0,
            scc_count: 0,
            non_trivial_count: 0,
            max_scc_size: 0,
            node_to_scc: vec![],
            scc_offsets: vec![0],
            scc_members: vec![],
            has_self_loop: vec![],
        }
    }

    /// Create a minimal `CondensationDag` for a given edge kind.
    fn make_cond(edge_kind: EdgeKind) -> CondensationDag {
        CondensationDag {
            edge_kind,
            scc_count: 1,
            edge_count: 0,
            row_offsets: vec![0, 0],
            col_indices: vec![],
            topo_order: vec![0],
            label_out_offsets: vec![0, 0],
            label_out_data: vec![],
            label_in_offsets: vec![0, 0],
            label_in_data: vec![],
            strategy: ReachabilityStrategy::DagBfs,
        }
    }

    /// Create the four edge kinds used by analysis in canonical order.
    fn analysis_edge_kinds() -> Vec<(&'static str, EdgeKind)> {
        vec![
            ("calls", calls_kind()),
            ("imports", imports_kind()),
            ("references", EdgeKind::References),
            ("inherits", EdgeKind::Inherits),
        ]
    }

    /// Write all 8 analysis files (4 SCC + 4 condensation) stamped with the
    /// given manifest hash. The manifest itself is NOT written here; use
    /// `write_manifest` for that.
    fn write_analysis_files(root: &std::path::Path, manifest_hash: &str) {
        let storage = GraphStorage::new(root);
        let identity = AnalysisIdentity::new(manifest_hash.to_string(), [0u8; 32]);
        std::fs::create_dir_all(storage.analysis_dir()).unwrap();

        for (kind_str, edge_kind) in analysis_edge_kinds() {
            let scc = make_scc(edge_kind.clone(), 5);
            persist_scc(&scc, &identity, &storage.analysis_scc_path(kind_str)).unwrap();

            let cond = make_cond(edge_kind);
            persist_condensation(&cond, &identity, &storage.analysis_cond_path(kind_str)).unwrap();
        }
    }

    /// Write a manifest file with the given content and return the hash that
    /// `compute_manifest_hash` reports for it.
    fn write_manifest(root: &std::path::Path, content: &str) -> String {
        let storage = GraphStorage::new(root);
        std::fs::create_dir_all(storage.graph_dir()).unwrap();
        std::fs::write(storage.manifest_path(), content).unwrap();
        compute_manifest_hash(storage.manifest_path()).unwrap()
    }

    // ========================================================================
    // has_fresh_analysis tests
    // ========================================================================

    #[test]
    fn has_fresh_analysis_false_when_no_files_exist() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();

        // Write only a manifest, but no analysis files.
        write_manifest(root, MANIFEST_JSON);

        let storage = GraphStorage::new(root);
        assert!(!has_fresh_analysis(&storage));
    }

    #[test]
    fn has_fresh_analysis_false_when_no_manifest_exists() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();

        // No manifest at all — compute_manifest_hash should fail.
        let storage = GraphStorage::new(root);
        assert!(!has_fresh_analysis(&storage));
    }

    #[test]
    fn has_fresh_analysis_true_when_all_files_match() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();

        let hash = write_manifest(root, MANIFEST_JSON);
        write_analysis_files(root, &hash);

        let storage = GraphStorage::new(root);
        assert!(has_fresh_analysis(&storage));
    }

    #[test]
    fn has_fresh_analysis_false_when_manifest_hash_mismatches() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();

        // Write a real manifest, then stamp the analysis files with a hash
        // that cannot match it, as if they were left over from an older index.
        write_manifest(root, MANIFEST_JSON);
        write_analysis_files(root, "stale_hash_that_wont_match");

        let storage = GraphStorage::new(root);
        assert!(!has_fresh_analysis(&storage));
    }

    #[test]
    fn has_fresh_analysis_false_when_one_scc_file_missing() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();

        let hash = write_manifest(root, MANIFEST_JSON);
        write_analysis_files(root, &hash);

        // Remove one SCC file to simulate partial corruption.
        let storage = GraphStorage::new(root);
        std::fs::remove_file(storage.analysis_scc_path("imports")).unwrap();

        assert!(!has_fresh_analysis(&storage));
    }

    #[test]
    fn has_fresh_analysis_false_when_one_cond_file_missing() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();

        let hash = write_manifest(root, MANIFEST_JSON);
        write_analysis_files(root, &hash);

        // Remove one condensation file.
        let storage = GraphStorage::new(root);
        std::fs::remove_file(storage.analysis_cond_path("references")).unwrap();

        assert!(!has_fresh_analysis(&storage));
    }

    // ========================================================================
    // collect_analysis_stats tests
    // ========================================================================

    #[test]
    fn collect_analysis_stats_populated() {
        let analyses = GraphAnalyses {
            adjacency: CsrAdjacency {
                node_count: 42,
                edge_count: 100,
                row_offsets: vec![],
                col_indices: vec![],
                edge_kinds: vec![],
            },
            scc_calls: make_scc(calls_kind(), 10),
            scc_imports: make_scc(imports_kind(), 5),
            scc_references: make_scc(EdgeKind::References, 3),
            scc_inherits: make_scc(EdgeKind::Inherits, 0),
            cond_calls: make_cond(calls_kind()),
            cond_imports: make_cond(imports_kind()),
            cond_references: make_cond(EdgeKind::References),
            cond_inherits: make_cond(EdgeKind::Inherits),
        };

        let duration = Duration::from_millis(1234);
        let stats = collect_analysis_stats(&analyses, duration);

        assert_eq!(stats.node_count, 42);
        assert_eq!(stats.edge_count, 100);
        assert_eq!(stats.scc_stats.len(), 4);

        // Verify each edge kind is represented correctly.
        assert_eq!(stats.scc_stats[0].edge_kind, "calls");
        assert_eq!(stats.scc_stats[0].scc_count, 10);
        assert_eq!(stats.scc_stats[0].non_trivial_count, 1);
        assert_eq!(stats.scc_stats[0].max_scc_size, 3);

        assert_eq!(stats.scc_stats[1].edge_kind, "imports");
        assert_eq!(stats.scc_stats[1].scc_count, 5);

        assert_eq!(stats.scc_stats[2].edge_kind, "references");
        assert_eq!(stats.scc_stats[2].scc_count, 3);

        assert_eq!(stats.scc_stats[3].edge_kind, "inherits");
        assert_eq!(stats.scc_stats[3].scc_count, 0);
        assert_eq!(stats.scc_stats[3].non_trivial_count, 0);
        assert_eq!(stats.scc_stats[3].max_scc_size, 1);

        // Build time should be faithfully captured. Compare with a tolerance
        // rather than relying on exact floating-point equality.
        assert!(
            (stats.build_time_secs - 1.234).abs() < 1e-9,
            "Unexpected build time: {}",
            stats.build_time_secs
        );
    }

    #[test]
    fn collect_analysis_stats_empty_graph() {
        let analyses = GraphAnalyses {
            adjacency: CsrAdjacency {
                node_count: 0,
                edge_count: 0,
                row_offsets: vec![0],
                col_indices: vec![],
                edge_kinds: vec![],
            },
            scc_calls: make_empty_scc(calls_kind()),
            scc_imports: make_empty_scc(imports_kind()),
            scc_references: make_empty_scc(EdgeKind::References),
            scc_inherits: make_empty_scc(EdgeKind::Inherits),
            cond_calls: make_cond(calls_kind()),
            cond_imports: make_cond(imports_kind()),
            cond_references: make_cond(EdgeKind::References),
            cond_inherits: make_cond(EdgeKind::Inherits),
        };

        let duration = Duration::from_secs(0);
        let stats = collect_analysis_stats(&analyses, duration);

        assert_eq!(stats.node_count, 0);
        assert_eq!(stats.edge_count, 0);
        // Guard against the loop below passing vacuously.
        assert_eq!(stats.scc_stats.len(), 4);
        for scc_stat in &stats.scc_stats {
            assert_eq!(scc_stat.scc_count, 0);
            assert_eq!(scc_stat.non_trivial_count, 0);
            assert_eq!(scc_stat.max_scc_size, 0);
        }
    }

    // ========================================================================
    // format_stats_text tests
    // ========================================================================

    #[test]
    fn format_stats_text_contains_expected_labels() {
        let stats = AnalysisStats {
            node_count: 10,
            edge_count: 20,
            build_time_secs: 0.5,
            scc_stats: vec![
                SccStats {
                    edge_kind: "calls".to_string(),
                    scc_count: 3,
                    non_trivial_count: 1,
                    max_scc_size: 5,
                },
                SccStats {
                    edge_kind: "imports".to_string(),
                    scc_count: 2,
                    non_trivial_count: 0,
                    max_scc_size: 1,
                },
            ],
        };

        let tmp = tempfile::tempdir().unwrap();
        let analysis_dir = tmp.path().join("analysis");
        std::fs::create_dir_all(&analysis_dir).unwrap();

        let output = format_stats_text(&stats, &analysis_dir);

        assert!(
            output.contains("Graph analysis complete"),
            "Expected completion marker: {output}"
        );
        assert!(output.contains("10 nodes"), "Expected node count: {output}");
        assert!(output.contains("20 edges"), "Expected edge count: {output}");
        assert!(output.contains("0.50s"), "Expected build time: {output}");
        assert!(
            output.contains("calls"),
            "Expected calls SCC stats: {output}"
        );
        assert!(
            output.contains("imports"),
            "Expected imports SCC stats: {output}"
        );
        assert!(output.contains("3 SCCs"), "Expected SCC count: {output}");
        assert!(
            output.contains("max size: 5"),
            "Expected max SCC size: {output}"
        );
        assert!(
            output.contains(analysis_dir.to_string_lossy().as_ref()),
            "Expected analysis dir path: {output}"
        );
    }

    #[test]
    fn format_stats_text_empty_scc_stats() {
        let stats = AnalysisStats {
            node_count: 0,
            edge_count: 0,
            build_time_secs: 0.0,
            scc_stats: vec![],
        };
        let tmp = tempfile::tempdir().unwrap();
        let output = format_stats_text(&stats, tmp.path());

        assert!(
            output.contains("Graph analysis complete"),
            "Missing header: {output}"
        );
        assert!(output.contains("0 nodes"), "Expected 0 nodes: {output}");
        assert!(output.contains("0 edges"), "Expected 0 edges: {output}");
    }
}