Skip to main content

sqry_cli/commands/
duplicates.rs

1//! Duplicates command implementation
2//!
3//! Provides CLI interface for finding duplicate code in the codebase.
4
5use crate::args::Cli;
6use crate::commands::graph::loader::{GraphLoadConfig, load_unified_graph_for_cli};
7use crate::index_discovery::find_nearest_index;
8use crate::output::OutputStreams;
9use anyhow::{Context, Result};
10use serde::Serialize;
11use sqry_core::query::{DuplicateConfig, DuplicateType, build_duplicate_groups_graph};
12
13/// Duplicate group for output
14#[derive(Debug, Serialize)]
15struct DuplicateGroupOutput {
16    /// Group identifier (hash as 32-char hex string for body duplicates, 16-char for others)
17    ///
18    /// For body duplicates with 128-bit `body_hash`, this is formatted as a 32-character
19    /// lowercase hexadecimal string (e.g., "000000000000000012345678abcdef01").
20    /// For signature/struct duplicates, this is a 16-character hex string from the u64 hash.
21    group_id: String,
22    /// Number of duplicates in this group
23    count: usize,
24    /// Symbols in this group
25    symbols: Vec<DuplicateSymbol>,
26}
27
28/// Symbol info for duplicate output
29#[derive(Debug, Serialize)]
30struct DuplicateSymbol {
31    name: String,
32    qualified_name: String,
33    kind: String,
34    file: String,
35    line: u32,
36    language: String,
37}
38
39/// Run the duplicates command.
40///
41/// # Errors
42/// Returns an error if the graph cannot be loaded or duplicates cannot be found.
43pub fn run_duplicates(
44    cli: &Cli,
45    path: Option<&str>,
46    dup_type: &str,
47    threshold: u32,
48    max_results: usize,
49    exact: bool,
50) -> Result<()> {
51    let mut streams = OutputStreams::new();
52
53    // Parse duplicate type
54    let duplicate_type: DuplicateType = dup_type
55        .parse()
56        .with_context(|| format!("Invalid duplicate type: {dup_type}"))?;
57
58    // Find index
59    let search_path = path.map_or_else(
60        || std::env::current_dir().unwrap_or_default(),
61        std::path::PathBuf::from,
62    );
63
64    let index_location = find_nearest_index(&search_path);
65    let Some(ref loc) = index_location else {
66        streams
67            .write_diagnostic("No .sqry-index found. Run 'sqry index' first to build the index.")?;
68        return Ok(());
69    };
70
71    // Load unified graph
72    let graph_config = GraphLoadConfig::default();
73    let graph = load_unified_graph_for_cli(&loc.index_root, &graph_config, cli)
74        .context("Failed to load graph. Run 'sqry index' to build the graph.")?;
75
76    // Build config
77    let config = DuplicateConfig {
78        threshold: if exact {
79            1.0
80        } else {
81            f64::from(threshold) / 100.0
82        },
83        max_results,
84        is_exact_only: exact || threshold >= 100,
85        ..Default::default()
86    };
87
88    // Find duplicates using graph-based detection
89    let groups = build_duplicate_groups_graph(duplicate_type, &graph, &config);
90
91    let strings = graph.strings();
92    let files = graph.files();
93
94    // Convert to output format
95    let mut output_groups: Vec<DuplicateGroupOutput> = groups
96        .into_iter()
97        .filter(|g| g.node_ids.len() > 1)
98        .map(|group| {
99            let symbols: Vec<DuplicateSymbol> = group
100                .node_ids
101                .iter()
102                .filter_map(|&node_id| {
103                    let entry = graph.nodes().get(node_id)?;
104
105                    let name = strings
106                        .resolve(entry.name)
107                        .map(|s| s.to_string())
108                        .unwrap_or_default();
109
110                    let qualified_name = entry
111                        .qualified_name
112                        .and_then(|id| strings.resolve(id))
113                        .map_or_else(|| name.clone(), |s| s.to_string());
114
115                    let file_path = files
116                        .resolve(entry.file)
117                        .map(|p| p.display().to_string())
118                        .unwrap_or_default();
119
120                    let language = files
121                        .language_for_file(entry.file)
122                        .map_or_else(|| "Unknown".to_string(), |l| l.to_string());
123
124                    Some(DuplicateSymbol {
125                        name,
126                        qualified_name,
127                        kind: format!("{:?}", entry.kind),
128                        file: file_path,
129                        line: entry.start_line,
130                        language,
131                    })
132                })
133                .collect();
134
135            // Format group_id as hex string
136            // - For body duplicates with 128-bit hash: 32-char hex
137            // - For others: 16-char hex from u64
138            let group_id = if let Some(body_hash) = group.body_hash_128 {
139                format!("{body_hash}") // BodyHash128::Display is 32-char hex
140            } else {
141                format!("{:016x}", group.hash)
142            };
143
144            DuplicateGroupOutput {
145                group_id,
146                count: symbols.len(),
147                symbols,
148            }
149        })
150        .filter(|g| g.count > 1)
151        .collect();
152
153    // Sort by group size (largest first) for deterministic output
154    // Secondary sort by group_id string for stable ordering
155    output_groups.sort_by(|a, b| {
156        b.count
157            .cmp(&a.count)
158            .then_with(|| a.group_id.cmp(&b.group_id))
159    });
160    output_groups.truncate(max_results);
161
162    // Output
163    if cli.json {
164        let json =
165            serde_json::to_string_pretty(&output_groups).context("Failed to serialize to JSON")?;
166        streams.write_result(&json)?;
167    } else {
168        let output = format_duplicates_text(&output_groups, duplicate_type);
169        streams.write_result(&output)?;
170    }
171
172    Ok(())
173}
174
175/// Format duplicates as human-readable text
176fn format_duplicates_text(groups: &[DuplicateGroupOutput], dup_type: DuplicateType) -> String {
177    let mut lines = Vec::new();
178
179    let type_name = match dup_type {
180        DuplicateType::Body => "body",
181        DuplicateType::Signature => "signature",
182        DuplicateType::Struct => "struct",
183    };
184
185    lines.push(format!(
186        "Found {} duplicate groups (type: {})",
187        groups.len(),
188        type_name
189    ));
190    lines.push(String::new());
191
192    for (i, group) in groups.iter().enumerate() {
193        lines.push(format!("Group {} ({} duplicates):", i + 1, group.count));
194        for sym in &group.symbols {
195            lines.push(format!(
196                "  {} [{}] {}:{}",
197                sym.qualified_name, sym.kind, sym.file, sym.line
198            ));
199        }
200        lines.push(String::new());
201    }
202
203    if groups.is_empty() {
204        lines.push("No duplicates found.".to_string());
205    }
206
207    lines.join("\n")
208}