Skip to main content

sqry_core/query/executor/
graph_duplicates.rs

1//! Duplicate detection for unified graph.
2//!
3//! This module provides duplicate code detection using the unified graph API,
4//! replacing the legacy index-based duplicate detection.
5
6use crate::graph::body_hash::BodyHash128;
7use crate::graph::unified::concurrent::CodeGraph;
8use crate::graph::unified::node::{NodeId, NodeKind};
9use std::collections::HashMap;
10use std::hash::{Hash, Hasher};
11use std::str::FromStr;
12
13/// Type of duplicate detection
14#[derive(Debug, Clone, Copy, PartialEq, Eq)]
15pub enum DuplicateType {
16    /// Functions with identical/similar bodies (based on signature when body unavailable)
17    Body,
18    /// Functions with identical signatures (return type only for now)
19    Signature,
20    /// Structs with similar field layouts
21    Struct,
22}
23
24impl DuplicateType {
25    /// Parse duplicate type from a query value string.
26    #[must_use]
27    pub fn parse(value: &str) -> Option<Self> {
28        value.parse().ok()
29    }
30}
31
32impl FromStr for DuplicateType {
33    type Err = anyhow::Error;
34
35    fn from_str(value: &str) -> Result<Self, Self::Err> {
36        match value.trim().to_lowercase().as_str() {
37            "body" | "function" => Ok(Self::Body),
38            "signature" | "sig" => Ok(Self::Signature),
39            "struct" | "type" => Ok(Self::Struct),
40            _ => Err(anyhow::anyhow!("Unknown duplicate type: {value}")),
41        }
42    }
43}
44
45/// Configuration for duplicate detection
46#[derive(Debug, Clone)]
47pub struct DuplicateConfig {
48    /// Minimum similarity threshold (0.0 - 1.0)
49    pub threshold: f64,
50    /// Maximum results to return
51    #[allow(dead_code)]
52    pub max_results: usize,
53    /// Include exact duplicates only
54    pub is_exact_only: bool,
55}
56
57impl Default for DuplicateConfig {
58    fn default() -> Self {
59        Self {
60            threshold: 0.8,
61            max_results: 1000,
62            is_exact_only: false,
63        }
64    }
65}
66
67/// A group of duplicate symbols
68#[derive(Debug, Clone)]
69pub struct DuplicateGroup {
70    /// Hash identifying this group (64-bit, for backward compatibility)
71    pub hash: u64,
72    /// Full 128-bit body hash (only set for body duplicate groups)
73    ///
74    /// This is used for proper hex string output in CLI/MCP.
75    /// When set, this is the actual body hash from the indexed nodes.
76    pub body_hash_128: Option<BodyHash128>,
77    /// Node IDs of duplicates in this group
78    pub node_ids: Vec<NodeId>,
79}
80
81/// Compute a hash for duplicate detection based on type
82fn compute_hash(graph: &CodeGraph, node_id: NodeId, dup_type: DuplicateType) -> Option<u64> {
83    let entry = graph.nodes().get(node_id)?;
84    let strings = graph.strings();
85
86    match dup_type {
87        DuplicateType::Body => {
88            // Primary: use the precomputed body_hash from the NodeEntry
89            // This is the 128-bit hash computed from actual body bytes during indexing
90            if let Some(body_hash) = entry.body_hash {
91                // Convert u128 to u64 by XOR-folding the two halves
92                // This preserves collision resistance for grouping purposes
93                return Some(body_hash.high ^ body_hash.low);
94            }
95
96            // Fallback for nodes without body_hash: use signature if available
97            if let Some(sig_id) = entry.signature
98                && let Some(sig) = strings.resolve(sig_id)
99            {
100                let mut hasher = std::collections::hash_map::DefaultHasher::new();
101                sig.hash(&mut hasher);
102                entry.kind.hash(&mut hasher);
103                return Some(hasher.finish());
104            }
105
106            // Last resort: hash qualified name + kind + line span (approximates body size)
107            let name = entry
108                .qualified_name
109                .and_then(|id| strings.resolve(id))
110                .or_else(|| strings.resolve(entry.name))?;
111
112            let mut hasher = std::collections::hash_map::DefaultHasher::new();
113            name.hash(&mut hasher);
114            entry.kind.hash(&mut hasher);
115            // Include line span as proxy for body size
116            let lines = entry.end_line.saturating_sub(entry.start_line);
117            lines.hash(&mut hasher);
118            Some(hasher.finish())
119        }
120        DuplicateType::Signature => {
121            // Hash signature if available, otherwise name + kind
122            if let Some(sig_id) = entry.signature
123                && let Some(sig) = strings.resolve(sig_id)
124            {
125                let mut hasher = std::collections::hash_map::DefaultHasher::new();
126                sig.hash(&mut hasher);
127                return Some(hasher.finish());
128            }
129
130            // Fallback: hash name + kind
131            let name = strings.resolve(entry.name)?;
132            let mut hasher = std::collections::hash_map::DefaultHasher::new();
133            name.hash(&mut hasher);
134            entry.kind.hash(&mut hasher);
135            Some(hasher.finish())
136        }
137        DuplicateType::Struct => {
138            // Only consider structs/classes
139            if !matches!(entry.kind, NodeKind::Struct | NodeKind::Class) {
140                return None;
141            }
142
143            // Hash based on struct name and fields (using qualified name as proxy)
144            let name = entry
145                .qualified_name
146                .and_then(|id| strings.resolve(id))
147                .or_else(|| strings.resolve(entry.name))?;
148
149            let mut hasher = std::collections::hash_map::DefaultHasher::new();
150            name.hash(&mut hasher);
151            entry.kind.hash(&mut hasher);
152            Some(hasher.finish())
153        }
154    }
155}
156
157/// Find all duplicate groups in the graph.
158///
159/// Groups nodes by a hash computed from their metadata, based on the duplicate type:
160/// - `Body`: Hash includes kind, signature (or name + line span), for functions/methods
161/// - `Signature`: Hash includes only the signature string
162/// - `Struct`: Hash includes only the name, for structs/classes only
163///
164/// # Arguments
165///
166/// * `duplicate_type` - The type of duplication to detect (Body, Signature, or Struct)
167/// * `graph` - The code graph to analyze
168/// * `config` - Configuration for exact matching and result limits
169///
170/// # Returns
171///
172/// A vector of `DuplicateGroup` structs, each containing a list of node IDs
173/// that share the same hash. Groups are sorted by size (largest first) and
174/// limited by `config.max_results`. Single-node "groups" are filtered out.
175#[must_use]
176pub fn build_duplicate_groups_graph(
177    dup_type: DuplicateType,
178    graph: &CodeGraph,
179    config: &DuplicateConfig,
180) -> Vec<DuplicateGroup> {
181    let mut hash_to_nodes: HashMap<u64, Vec<NodeId>> = HashMap::new();
182
183    // Only consider relevant node kinds for each duplicate type
184    let relevant_kinds: Vec<NodeKind> = match dup_type {
185        DuplicateType::Body | DuplicateType::Signature => {
186            vec![NodeKind::Function, NodeKind::Method]
187        }
188        DuplicateType::Struct => vec![NodeKind::Struct, NodeKind::Class],
189    };
190
191    // Group nodes by hash
192    for (node_id, entry) in graph.nodes().iter() {
193        if !relevant_kinds.contains(&entry.kind) {
194            continue;
195        }
196
197        if let Some(hash) = compute_hash(graph, node_id, dup_type) {
198            hash_to_nodes.entry(hash).or_default().push(node_id);
199        }
200    }
201
202    // Filter to groups with duplicates and apply threshold
203    let mut groups: Vec<DuplicateGroup> = hash_to_nodes
204        .into_iter()
205        .filter(|(_, nodes)| {
206            if config.is_exact_only {
207                nodes.len() > 1
208            } else {
209                // For non-exact matching, we'd need fuzzy comparison
210                // For now, treat as exact matching
211                nodes.len() > 1
212            }
213        })
214        .map(|(hash, node_ids)| {
215            // For body duplicates, extract the full 128-bit body_hash from the first node
216            let body_hash_128 = if dup_type == DuplicateType::Body {
217                node_ids
218                    .first()
219                    .and_then(|&id| graph.nodes().get(id).and_then(|entry| entry.body_hash))
220            } else {
221                None
222            };
223
224            DuplicateGroup {
225                hash,
226                body_hash_128,
227                node_ids,
228            }
229        })
230        .collect();
231
232    // Sort by group size (largest first)
233    groups.sort_by(|a, b| b.node_ids.len().cmp(&a.node_ids.len()));
234
235    // Apply max_results limit
236    groups.truncate(config.max_results);
237
238    groups
239}
240
241#[cfg(test)]
242mod tests {
243    use super::*;
244    use crate::graph::unified::storage::arena::NodeEntry;
245    use std::path::Path;
246
247    /// Helper to create a test graph with nodes having specific signatures.
248    fn create_test_graph_with_signatures(
249        nodes: &[(&str, NodeKind, Option<&str>)],
250    ) -> (CodeGraph, Vec<NodeId>) {
251        let mut graph = CodeGraph::new();
252        let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
253        let mut node_ids = Vec::new();
254
255        for (name, kind, signature) in nodes {
256            let name_id = graph.strings_mut().intern(name).unwrap();
257            let mut entry = NodeEntry::new(*kind, name_id, file_id).with_qualified_name(name_id);
258
259            if let Some(sig) = signature {
260                let sig_id = graph.strings_mut().intern(sig).unwrap();
261                entry = entry.with_signature(sig_id);
262            }
263
264            let node_id = graph.nodes_mut().alloc(entry).unwrap();
265            node_ids.push(node_id);
266        }
267
268        (graph, node_ids)
269    }
270
271    /// Helper to create nodes with line spans (for body-based hashing fallback).
272    fn create_test_graph_with_spans(
273        nodes: &[(&str, NodeKind, u32, u32)], // name, kind, start_line, end_line
274    ) -> (CodeGraph, Vec<NodeId>) {
275        let mut graph = CodeGraph::new();
276        let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
277        let mut node_ids = Vec::new();
278
279        for (name, kind, start_line, end_line) in nodes {
280            let name_id = graph.strings_mut().intern(name).unwrap();
281            let entry = NodeEntry::new(*kind, name_id, file_id)
282                .with_qualified_name(name_id)
283                .with_location(*start_line, 0, *end_line, 0);
284
285            let node_id = graph.nodes_mut().alloc(entry).unwrap();
286            node_ids.push(node_id);
287        }
288
289        (graph, node_ids)
290    }
291
292    #[test]
293    fn test_empty_graph() {
294        let graph = CodeGraph::new();
295        let config = DuplicateConfig::default();
296
297        let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
298        assert!(groups.is_empty());
299    }
300
301    #[test]
302    fn test_no_duplicates_unique_signatures() {
303        // Three functions with unique signatures - no duplicates
304        let nodes = [
305            ("func_a", NodeKind::Function, Some("fn func_a() -> i32")),
306            ("func_b", NodeKind::Function, Some("fn func_b() -> String")),
307            (
308                "func_c",
309                NodeKind::Function,
310                Some("fn func_c(x: i32) -> bool"),
311            ),
312        ];
313        let (graph, _) = create_test_graph_with_signatures(&nodes);
314        let config = DuplicateConfig::default();
315
316        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
317        assert!(groups.is_empty(), "No duplicates should be found");
318    }
319
320    #[test]
321    fn test_signature_duplicates() {
322        // Two functions with identical signatures
323        let nodes = [
324            (
325                "func_a",
326                NodeKind::Function,
327                Some("fn process(x: i32) -> i32"),
328            ),
329            (
330                "func_b",
331                NodeKind::Function,
332                Some("fn process(x: i32) -> i32"),
333            ),
334            ("func_c", NodeKind::Function, Some("fn other() -> String")),
335        ];
336        let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
337        let config = DuplicateConfig::default();
338
339        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
340        assert_eq!(groups.len(), 1, "Should find one duplicate group");
341        assert_eq!(
342            groups[0].node_ids.len(),
343            2,
344            "Group should have 2 duplicates"
345        );
346        assert!(groups[0].node_ids.contains(&node_ids[0]));
347        assert!(groups[0].node_ids.contains(&node_ids[1]));
348    }
349
350    #[test]
351    fn test_body_duplicates_with_signatures() {
352        // Body duplicates are detected via signature + kind
353        let nodes = [
354            (
355                "func_a",
356                NodeKind::Function,
357                Some("fn compute(x: i32) -> i32"),
358            ),
359            (
360                "func_b",
361                NodeKind::Function,
362                Some("fn compute(x: i32) -> i32"),
363            ),
364            (
365                "func_c",
366                NodeKind::Method,
367                Some("fn compute(x: i32) -> i32"),
368            ), // Different kind
369        ];
370        let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
371        let config = DuplicateConfig::default();
372
373        let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
374        // func_a and func_b should be duplicates (same signature, same kind)
375        // func_c is Method, not Function, so different hash
376        assert_eq!(groups.len(), 1, "Should find one duplicate group");
377        assert_eq!(groups[0].node_ids.len(), 2);
378        assert!(groups[0].node_ids.contains(&node_ids[0]));
379        assert!(groups[0].node_ids.contains(&node_ids[1]));
380    }
381
382    #[test]
383    fn test_body_duplicates_fallback_to_name_and_span() {
384        // When no signature, use name + kind + line span
385        let nodes = [
386            ("helper", NodeKind::Function, 10, 20), // 10 lines
387            ("helper", NodeKind::Function, 30, 40), // 10 lines (same span size)
388            ("other", NodeKind::Function, 50, 60),  // Different name
389        ];
390        let (graph, node_ids) = create_test_graph_with_spans(&nodes);
391        let config = DuplicateConfig::default();
392
393        let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
394        // Nodes with same name, kind, and span should be grouped
395        assert_eq!(groups.len(), 1, "Should find one duplicate group");
396        assert!(groups[0].node_ids.contains(&node_ids[0]));
397        assert!(groups[0].node_ids.contains(&node_ids[1]));
398    }
399
400    #[test]
401    fn test_struct_duplicates() {
402        // Only structs/classes are considered for struct duplicates
403        let nodes = [
404            ("MyStruct", NodeKind::Struct, None),
405            ("MyStruct", NodeKind::Struct, None),
406            ("MyStruct", NodeKind::Function, None), // Function, not struct - ignored
407            ("OtherStruct", NodeKind::Struct, None),
408        ];
409        let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
410        let config = DuplicateConfig::default();
411
412        let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
413        // Only the two MyStruct nodes should be grouped
414        assert_eq!(groups.len(), 1, "Should find one duplicate group");
415        assert!(groups[0].node_ids.contains(&node_ids[0]));
416        assert!(groups[0].node_ids.contains(&node_ids[1]));
417        assert!(!groups[0].node_ids.contains(&node_ids[2])); // Function ignored
418    }
419
420    #[test]
421    fn test_class_duplicates() {
422        // Classes are also considered for struct duplicates
423        let nodes = [
424            ("MyClass", NodeKind::Class, None),
425            ("MyClass", NodeKind::Class, None),
426            ("OtherClass", NodeKind::Class, None),
427        ];
428        let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
429        let config = DuplicateConfig::default();
430
431        let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
432        assert_eq!(groups.len(), 1);
433        assert!(groups[0].node_ids.contains(&node_ids[0]));
434        assert!(groups[0].node_ids.contains(&node_ids[1]));
435    }
436
437    #[test]
438    fn test_methods_ignored_for_struct_duplicates() {
439        // Methods should not be detected as struct duplicates
440        let nodes = [
441            ("process", NodeKind::Method, Some("fn process()")),
442            ("process", NodeKind::Method, Some("fn process()")),
443        ];
444        let (graph, _) = create_test_graph_with_signatures(&nodes);
445        let config = DuplicateConfig::default();
446
447        let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
448        assert!(
449            groups.is_empty(),
450            "Methods should not be considered for struct duplicates"
451        );
452    }
453
454    #[test]
455    fn test_max_results_limit() {
456        // Create many duplicate groups and verify limit is respected
457        let mut nodes = Vec::new();
458        for i in 0..10 {
459            // Each pair has same signature within group
460            let sig = format!("fn group{i}()");
461            nodes.push((format!("func_{i}_a"), NodeKind::Function, Some(sig.clone())));
462            nodes.push((format!("func_{i}_b"), NodeKind::Function, Some(sig)));
463        }
464
465        let nodes_ref: Vec<(&str, NodeKind, Option<&str>)> = nodes
466            .iter()
467            .map(|(name, kind, sig)| (name.as_str(), *kind, sig.as_deref()))
468            .collect();
469        let (graph, _) = create_test_graph_with_signatures(&nodes_ref);
470        let config = DuplicateConfig {
471            max_results: 3,
472            ..Default::default()
473        };
474
475        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
476        assert_eq!(groups.len(), 3, "Should respect max_results limit");
477    }
478
479    #[test]
480    fn test_groups_sorted_by_size() {
481        // Create groups of different sizes and verify they're sorted largest first
482        let nodes = [
483            // Group of 3
484            ("large_a", NodeKind::Function, Some("fn large()")),
485            ("large_b", NodeKind::Function, Some("fn large()")),
486            ("large_c", NodeKind::Function, Some("fn large()")),
487            // Group of 2
488            ("small_a", NodeKind::Function, Some("fn small()")),
489            ("small_b", NodeKind::Function, Some("fn small()")),
490        ];
491        let (graph, _) = create_test_graph_with_signatures(&nodes);
492        let config = DuplicateConfig::default();
493
494        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
495        assert_eq!(groups.len(), 2);
496        assert_eq!(groups[0].node_ids.len(), 3, "Largest group should be first");
497        assert_eq!(
498            groups[1].node_ids.len(),
499            2,
500            "Smaller group should be second"
501        );
502    }
503
504    #[test]
505    fn test_single_node_not_duplicate() {
506        // A single node is not a duplicate
507        let nodes = [
508            ("unique_func", NodeKind::Function, Some("fn unique()")),
509            ("other_func", NodeKind::Function, Some("fn different()")),
510        ];
511        let (graph, _) = create_test_graph_with_signatures(&nodes);
512        let config = DuplicateConfig::default();
513
514        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
515        assert!(
516            groups.is_empty(),
517            "Single nodes should not form duplicate groups"
518        );
519    }
520
521    #[test]
522    fn test_mixed_kinds_not_duplicates() {
523        // Same signature but different kinds should not be duplicates
524        let nodes = [
525            ("process", NodeKind::Function, Some("fn process()")),
526            ("process", NodeKind::Method, Some("fn process()")),
527        ];
528        let (graph, _) = create_test_graph_with_signatures(&nodes);
529        let config = DuplicateConfig::default();
530
531        // For Body type, kind matters in the hash
532        let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
533        assert!(
534            groups.is_empty(),
535            "Different kinds should not be duplicates for body type"
536        );
537
538        // For Signature type, only signature matters
539        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
540        assert_eq!(
541            groups.len(),
542            1,
543            "Same signature should be duplicates regardless of kind"
544        );
545    }
546
547    #[test]
548    fn test_multiple_duplicate_groups() {
549        // Multiple independent duplicate groups
550        let nodes = [
551            ("func_a1", NodeKind::Function, Some("fn alpha()")),
552            ("func_a2", NodeKind::Function, Some("fn alpha()")),
553            ("func_b1", NodeKind::Function, Some("fn beta()")),
554            ("func_b2", NodeKind::Function, Some("fn beta()")),
555            ("unique", NodeKind::Function, Some("fn gamma()")),
556        ];
557        let (graph, _) = create_test_graph_with_signatures(&nodes);
558        let config = DuplicateConfig::default();
559
560        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
561        assert_eq!(groups.len(), 2, "Should find two duplicate groups");
562    }
563
564    #[test]
565    fn test_exact_only_config() {
566        // When exact_only is true, behavior should be the same (hash-based)
567        let nodes = [
568            ("func_a", NodeKind::Function, Some("fn exact()")),
569            ("func_b", NodeKind::Function, Some("fn exact()")),
570        ];
571        let (graph, _) = create_test_graph_with_signatures(&nodes);
572        let config = DuplicateConfig {
573            is_exact_only: true,
574            ..Default::default()
575        };
576
577        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
578        assert_eq!(groups.len(), 1);
579    }
580}