Skip to main content

sqry_core/query/executor/
graph_duplicates.rs

1//! Duplicate detection for unified graph.
2//!
3//! This module provides duplicate code detection using the unified graph API,
4//! replacing the legacy index-based duplicate detection.
5
6use crate::graph::body_hash::BodyHash128;
7use crate::graph::unified::concurrent::CodeGraph;
8use crate::graph::unified::node::{NodeId, NodeKind};
9use std::collections::HashMap;
10use std::hash::{Hash, Hasher};
11use std::str::FromStr;
12
13/// Type of duplicate detection
14#[derive(Debug, Clone, Copy, PartialEq, Eq)]
15pub enum DuplicateType {
16    /// Functions with identical/similar bodies (based on signature when body unavailable)
17    Body,
18    /// Functions with identical signatures (return type only for now)
19    Signature,
20    /// Structs with similar field layouts
21    Struct,
22}
23
24impl DuplicateType {
25    /// Parse duplicate type from a query value string.
26    #[must_use]
27    pub fn parse(value: &str) -> Option<Self> {
28        value.parse().ok()
29    }
30}
31
32impl FromStr for DuplicateType {
33    type Err = anyhow::Error;
34
35    fn from_str(value: &str) -> Result<Self, Self::Err> {
36        match value.trim().to_lowercase().as_str() {
37            "body" | "function" => Ok(Self::Body),
38            "signature" | "sig" => Ok(Self::Signature),
39            "struct" | "type" => Ok(Self::Struct),
40            _ => Err(anyhow::anyhow!("Unknown duplicate type: {value}")),
41        }
42    }
43}
44
45/// Configuration for duplicate detection
46#[derive(Debug, Clone)]
47pub struct DuplicateConfig {
48    /// Minimum similarity threshold (0.0 - 1.0)
49    pub threshold: f64,
50    /// Maximum results to return
51    #[allow(dead_code)]
52    pub max_results: usize,
53    /// Include exact duplicates only
54    pub is_exact_only: bool,
55    /// Maximum number of member nodes to include per group (0 = unlimited).
56    ///
57    /// When a group has more members than this cap, `node_ids` is truncated to
58    /// `max_members_per_group` entries and `DuplicateGroup::members_truncated`
59    /// is set to `true`. The full pre-truncation count is always recorded in
60    /// `DuplicateGroup::total_members`.
61    ///
62    /// Default: 10.  Opt-out (pre-v9.1 behavior): set to 0.
63    pub max_members_per_group: usize,
64}
65
66impl Default for DuplicateConfig {
67    fn default() -> Self {
68        Self {
69            threshold: 0.8,
70            max_results: 1000,
71            is_exact_only: false,
72            max_members_per_group: 10,
73        }
74    }
75}
76
77/// A group of duplicate symbols
78#[derive(Debug, Clone)]
79pub struct DuplicateGroup {
80    /// Hash identifying this group (64-bit, for backward compatibility)
81    pub hash: u64,
82    /// Full 128-bit body hash (only set for body duplicate groups)
83    ///
84    /// This is used for proper hex string output in CLI/MCP.
85    /// When set, this is the actual body hash from the indexed nodes.
86    pub body_hash_128: Option<BodyHash128>,
87    /// Node IDs of duplicates in this group.
88    ///
89    /// When `members_truncated` is `true` this slice contains only the first
90    /// `DuplicateConfig::max_members_per_group` entries (sorted by file path
91    /// then node name).  The full pre-truncation count is in `total_members`.
92    pub node_ids: Vec<NodeId>,
93    /// Total number of members in this group **before** any per-group cap was
94    /// applied.  Always equal to `node_ids.len()` when `members_truncated` is
95    /// `false`.
96    pub total_members: usize,
97    /// `true` when `node_ids` was truncated by `DuplicateConfig::max_members_per_group`.
98    pub members_truncated: bool,
99}
100
101/// Compute a hash for duplicate detection based on type
102fn compute_hash(graph: &CodeGraph, node_id: NodeId, dup_type: DuplicateType) -> Option<u64> {
103    let entry = graph.nodes().get(node_id)?;
104    // Defense-in-depth: even if a caller forgets to filter unified
105    // losers at iteration time, refuse to compute a hash for one.
106    // See `NodeEntry::is_unified_loser`.
107    if entry.is_unified_loser() {
108        return None;
109    }
110    let strings = graph.strings();
111
112    match dup_type {
113        DuplicateType::Body => {
114            // Primary: use the precomputed body_hash from the NodeEntry
115            // This is the 128-bit hash computed from actual body bytes during indexing
116            if let Some(body_hash) = entry.body_hash {
117                // Convert u128 to u64 by XOR-folding the two halves
118                // This preserves collision resistance for grouping purposes
119                return Some(body_hash.high ^ body_hash.low);
120            }
121
122            // Fallback for nodes without body_hash: use signature if available
123            if let Some(sig_id) = entry.signature
124                && let Some(sig) = strings.resolve(sig_id)
125            {
126                let mut hasher = std::collections::hash_map::DefaultHasher::new();
127                sig.hash(&mut hasher);
128                entry.kind.hash(&mut hasher);
129                return Some(hasher.finish());
130            }
131
132            // Last resort: hash qualified name + kind + line span (approximates body size)
133            let name = entry
134                .qualified_name
135                .and_then(|id| strings.resolve(id))
136                .or_else(|| strings.resolve(entry.name))?;
137
138            let mut hasher = std::collections::hash_map::DefaultHasher::new();
139            name.hash(&mut hasher);
140            entry.kind.hash(&mut hasher);
141            // Include line span as proxy for body size
142            let lines = entry.end_line.saturating_sub(entry.start_line);
143            lines.hash(&mut hasher);
144            Some(hasher.finish())
145        }
146        DuplicateType::Signature => {
147            // Hash signature if available, otherwise name + kind
148            if let Some(sig_id) = entry.signature
149                && let Some(sig) = strings.resolve(sig_id)
150            {
151                let mut hasher = std::collections::hash_map::DefaultHasher::new();
152                sig.hash(&mut hasher);
153                return Some(hasher.finish());
154            }
155
156            // Fallback: hash name + kind
157            let name = strings.resolve(entry.name)?;
158            let mut hasher = std::collections::hash_map::DefaultHasher::new();
159            name.hash(&mut hasher);
160            entry.kind.hash(&mut hasher);
161            Some(hasher.finish())
162        }
163        DuplicateType::Struct => {
164            // Only consider structs/classes
165            if !matches!(entry.kind, NodeKind::Struct | NodeKind::Class) {
166                return None;
167            }
168
169            // Hash based on struct name and fields (using qualified name as proxy)
170            let name = entry
171                .qualified_name
172                .and_then(|id| strings.resolve(id))
173                .or_else(|| strings.resolve(entry.name))?;
174
175            let mut hasher = std::collections::hash_map::DefaultHasher::new();
176            name.hash(&mut hasher);
177            entry.kind.hash(&mut hasher);
178            Some(hasher.finish())
179        }
180    }
181}
182
183/// Find all duplicate groups in the graph.
184///
185/// Groups nodes by a hash computed from their metadata, based on the duplicate type:
186/// - `Body`: Hash includes kind, signature (or name + line span), for functions/methods
187/// - `Signature`: Hash includes only the signature string
188/// - `Struct`: Hash includes only the name, for structs/classes only
189///
190/// # Arguments
191///
192/// * `duplicate_type` - The type of duplication to detect (Body, Signature, or Struct)
193/// * `graph` - The code graph to analyze
194/// * `config` - Configuration for exact matching and result limits
195///
196/// # Returns
197///
198/// A vector of `DuplicateGroup` structs, each containing a list of node IDs
199/// that share the same hash. Groups are sorted by size (largest first) and
200/// limited by `config.max_results`. Single-node "groups" are filtered out.
201#[must_use]
202pub fn build_duplicate_groups_graph(
203    dup_type: DuplicateType,
204    graph: &CodeGraph,
205    config: &DuplicateConfig,
206) -> Vec<DuplicateGroup> {
207    let mut hash_to_nodes: HashMap<u64, Vec<NodeId>> = HashMap::new();
208
209    // Only consider relevant node kinds for each duplicate type
210    let relevant_kinds: Vec<NodeKind> = match dup_type {
211        DuplicateType::Body | DuplicateType::Signature => {
212            vec![NodeKind::Function, NodeKind::Method]
213        }
214        DuplicateType::Struct => vec![NodeKind::Struct, NodeKind::Class],
215    };
216
217    // Group nodes by hash.
218    //
219    // Gate 0d iter-2 blocker fix: skip Phase 4c-prime unified-away
220    // losers. They remain in the arena as inert duplicates so CSR
221    // row_ptr sizing stays stable, but publish-visible query
222    // evaluation (including `duplicates:`) MUST NOT surface them.
223    // `merge_node_into` now also clears `body_hash` and `signature`
224    // on losers as defense-in-depth, but the `is_unified_loser()`
225    // guard is the canonical exclusion check — keep both.
226    // See `NodeEntry::is_unified_loser` and
227    // `sqry-core/src/graph/unified/build/unification.rs`.
228    for (node_id, entry) in graph.nodes().iter() {
229        if entry.is_unified_loser() {
230            continue;
231        }
232        if !relevant_kinds.contains(&entry.kind) {
233            continue;
234        }
235
236        if let Some(hash) = compute_hash(graph, node_id, dup_type) {
237            hash_to_nodes.entry(hash).or_default().push(node_id);
238        }
239    }
240
241    // Filter to groups with duplicates and apply threshold
242    let mut groups: Vec<DuplicateGroup> = hash_to_nodes
243        .into_iter()
244        .filter(|(_, nodes)| {
245            if config.is_exact_only {
246                nodes.len() > 1
247            } else {
248                // For non-exact matching, we'd need fuzzy comparison
249                // For now, treat as exact matching
250                nodes.len() > 1
251            }
252        })
253        .map(|(hash, mut node_ids)| {
254            // For body duplicates, extract the full 128-bit body_hash from the first node.
255            // Skip unified losers defensively — `merge_node_into` clears
256            // `body_hash` on them so `entry.body_hash` should already be
257            // None, but the explicit guard keeps this site honest even if
258            // the clearing policy changes later.
259            let body_hash_128 = if dup_type == DuplicateType::Body {
260                node_ids.first().and_then(|&id| {
261                    graph.nodes().get(id).and_then(|entry| {
262                        if entry.is_unified_loser() {
263                            None
264                        } else {
265                            entry.body_hash
266                        }
267                    })
268                })
269            } else {
270                None
271            };
272
273            // Sort node_ids deterministically: by file path, then by node name,
274            // then by NodeId as a tiebreaker so ordering is fully stable even
275            // when two nodes share the same file and name.
276            let strings = graph.strings();
277            let files = graph.files();
278            node_ids.sort_by(|&a, &b| {
279                let path_a = graph
280                    .nodes()
281                    .get(a)
282                    .and_then(|e| files.resolve(e.file))
283                    .map(|p| p.to_string_lossy().into_owned())
284                    .unwrap_or_default();
285                let path_b = graph
286                    .nodes()
287                    .get(b)
288                    .and_then(|e| files.resolve(e.file))
289                    .map(|p| p.to_string_lossy().into_owned())
290                    .unwrap_or_default();
291                let name_a = graph
292                    .nodes()
293                    .get(a)
294                    .and_then(|e| strings.resolve(e.name))
295                    .as_deref()
296                    .map(str::to_owned)
297                    .unwrap_or_default();
298                let name_b = graph
299                    .nodes()
300                    .get(b)
301                    .and_then(|e| strings.resolve(e.name))
302                    .as_deref()
303                    .map(str::to_owned)
304                    .unwrap_or_default();
305                path_a
306                    .cmp(&path_b)
307                    .then_with(|| name_a.cmp(&name_b))
308                    .then_with(|| a.cmp(&b))
309            });
310
311            // Record the pre-truncation count and apply the per-group cap.
312            let total_members = node_ids.len();
313            let members_truncated =
314                config.max_members_per_group > 0 && node_ids.len() > config.max_members_per_group;
315            if members_truncated {
316                node_ids.truncate(config.max_members_per_group);
317            }
318
319            DuplicateGroup {
320                hash,
321                body_hash_128,
322                node_ids,
323                total_members,
324                members_truncated,
325            }
326        })
327        .collect();
328
329    // Sort by group size (largest first), using total_members so ordering
330    // reflects the full group even when members are truncated.
331    groups.sort_by(|a, b| b.total_members.cmp(&a.total_members));
332
333    // Apply max_results limit
334    groups.truncate(config.max_results);
335
336    groups
337}
338
339#[cfg(test)]
340mod tests {
341    use super::*;
342    use crate::graph::unified::storage::arena::NodeEntry;
343    use std::path::Path;
344
345    /// Helper to create a test graph with nodes having specific signatures.
346    fn create_test_graph_with_signatures(
347        nodes: &[(&str, NodeKind, Option<&str>)],
348    ) -> (CodeGraph, Vec<NodeId>) {
349        let mut graph = CodeGraph::new();
350        let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
351        let mut node_ids = Vec::new();
352
353        for (name, kind, signature) in nodes {
354            let name_id = graph.strings_mut().intern(name).unwrap();
355            let mut entry = NodeEntry::new(*kind, name_id, file_id).with_qualified_name(name_id);
356
357            if let Some(sig) = signature {
358                let sig_id = graph.strings_mut().intern(sig).unwrap();
359                entry = entry.with_signature(sig_id);
360            }
361
362            let node_id = graph.nodes_mut().alloc(entry).unwrap();
363            node_ids.push(node_id);
364        }
365
366        (graph, node_ids)
367    }
368
369    /// Helper to create nodes with line spans (for body-based hashing fallback).
370    fn create_test_graph_with_spans(
371        nodes: &[(&str, NodeKind, u32, u32)], // name, kind, start_line, end_line
372    ) -> (CodeGraph, Vec<NodeId>) {
373        let mut graph = CodeGraph::new();
374        let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
375        let mut node_ids = Vec::new();
376
377        for (name, kind, start_line, end_line) in nodes {
378            let name_id = graph.strings_mut().intern(name).unwrap();
379            let entry = NodeEntry::new(*kind, name_id, file_id)
380                .with_qualified_name(name_id)
381                .with_location(*start_line, 0, *end_line, 0);
382
383            let node_id = graph.nodes_mut().alloc(entry).unwrap();
384            node_ids.push(node_id);
385        }
386
387        (graph, node_ids)
388    }
389
390    #[test]
391    fn test_empty_graph() {
392        let graph = CodeGraph::new();
393        let config = DuplicateConfig::default();
394
395        let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
396        assert!(groups.is_empty());
397    }
398
399    #[test]
400    fn test_no_duplicates_unique_signatures() {
401        // Three functions with unique signatures - no duplicates
402        let nodes = [
403            ("func_a", NodeKind::Function, Some("fn func_a() -> i32")),
404            ("func_b", NodeKind::Function, Some("fn func_b() -> String")),
405            (
406                "func_c",
407                NodeKind::Function,
408                Some("fn func_c(x: i32) -> bool"),
409            ),
410        ];
411        let (graph, _) = create_test_graph_with_signatures(&nodes);
412        let config = DuplicateConfig::default();
413
414        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
415        assert!(groups.is_empty(), "No duplicates should be found");
416    }
417
418    #[test]
419    fn test_signature_duplicates() {
420        // Two functions with identical signatures
421        let nodes = [
422            (
423                "func_a",
424                NodeKind::Function,
425                Some("fn process(x: i32) -> i32"),
426            ),
427            (
428                "func_b",
429                NodeKind::Function,
430                Some("fn process(x: i32) -> i32"),
431            ),
432            ("func_c", NodeKind::Function, Some("fn other() -> String")),
433        ];
434        let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
435        let config = DuplicateConfig::default();
436
437        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
438        assert_eq!(groups.len(), 1, "Should find one duplicate group");
439        assert_eq!(
440            groups[0].node_ids.len(),
441            2,
442            "Group should have 2 duplicates"
443        );
444        assert!(groups[0].node_ids.contains(&node_ids[0]));
445        assert!(groups[0].node_ids.contains(&node_ids[1]));
446    }
447
448    #[test]
449    fn test_body_duplicates_with_signatures() {
450        // Body duplicates are detected via signature + kind
451        let nodes = [
452            (
453                "func_a",
454                NodeKind::Function,
455                Some("fn compute(x: i32) -> i32"),
456            ),
457            (
458                "func_b",
459                NodeKind::Function,
460                Some("fn compute(x: i32) -> i32"),
461            ),
462            (
463                "func_c",
464                NodeKind::Method,
465                Some("fn compute(x: i32) -> i32"),
466            ), // Different kind
467        ];
468        let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
469        let config = DuplicateConfig::default();
470
471        let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
472        // func_a and func_b should be duplicates (same signature, same kind)
473        // func_c is Method, not Function, so different hash
474        assert_eq!(groups.len(), 1, "Should find one duplicate group");
475        assert_eq!(groups[0].node_ids.len(), 2);
476        assert!(groups[0].node_ids.contains(&node_ids[0]));
477        assert!(groups[0].node_ids.contains(&node_ids[1]));
478    }
479
480    #[test]
481    fn test_body_duplicates_fallback_to_name_and_span() {
482        // When no signature, use name + kind + line span
483        let nodes = [
484            ("helper", NodeKind::Function, 10, 20), // 10 lines
485            ("helper", NodeKind::Function, 30, 40), // 10 lines (same span size)
486            ("other", NodeKind::Function, 50, 60),  // Different name
487        ];
488        let (graph, node_ids) = create_test_graph_with_spans(&nodes);
489        let config = DuplicateConfig::default();
490
491        let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
492        // Nodes with same name, kind, and span should be grouped
493        assert_eq!(groups.len(), 1, "Should find one duplicate group");
494        assert!(groups[0].node_ids.contains(&node_ids[0]));
495        assert!(groups[0].node_ids.contains(&node_ids[1]));
496    }
497
498    #[test]
499    fn test_struct_duplicates() {
500        // Only structs/classes are considered for struct duplicates
501        let nodes = [
502            ("MyStruct", NodeKind::Struct, None),
503            ("MyStruct", NodeKind::Struct, None),
504            ("MyStruct", NodeKind::Function, None), // Function, not struct - ignored
505            ("OtherStruct", NodeKind::Struct, None),
506        ];
507        let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
508        let config = DuplicateConfig::default();
509
510        let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
511        // Only the two MyStruct nodes should be grouped
512        assert_eq!(groups.len(), 1, "Should find one duplicate group");
513        assert!(groups[0].node_ids.contains(&node_ids[0]));
514        assert!(groups[0].node_ids.contains(&node_ids[1]));
515        assert!(!groups[0].node_ids.contains(&node_ids[2])); // Function ignored
516    }
517
518    #[test]
519    fn test_class_duplicates() {
520        // Classes are also considered for struct duplicates
521        let nodes = [
522            ("MyClass", NodeKind::Class, None),
523            ("MyClass", NodeKind::Class, None),
524            ("OtherClass", NodeKind::Class, None),
525        ];
526        let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
527        let config = DuplicateConfig::default();
528
529        let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
530        assert_eq!(groups.len(), 1);
531        assert!(groups[0].node_ids.contains(&node_ids[0]));
532        assert!(groups[0].node_ids.contains(&node_ids[1]));
533    }
534
535    #[test]
536    fn test_methods_ignored_for_struct_duplicates() {
537        // Methods should not be detected as struct duplicates
538        let nodes = [
539            ("process", NodeKind::Method, Some("fn process()")),
540            ("process", NodeKind::Method, Some("fn process()")),
541        ];
542        let (graph, _) = create_test_graph_with_signatures(&nodes);
543        let config = DuplicateConfig::default();
544
545        let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
546        assert!(
547            groups.is_empty(),
548            "Methods should not be considered for struct duplicates"
549        );
550    }
551
552    #[test]
553    fn test_max_results_limit() {
554        // Create many duplicate groups and verify limit is respected
555        let mut nodes = Vec::new();
556        for i in 0..10 {
557            // Each pair has same signature within group
558            let sig = format!("fn group{i}()");
559            nodes.push((format!("func_{i}_a"), NodeKind::Function, Some(sig.clone())));
560            nodes.push((format!("func_{i}_b"), NodeKind::Function, Some(sig)));
561        }
562
563        let nodes_ref: Vec<(&str, NodeKind, Option<&str>)> = nodes
564            .iter()
565            .map(|(name, kind, sig)| (name.as_str(), *kind, sig.as_deref()))
566            .collect();
567        let (graph, _) = create_test_graph_with_signatures(&nodes_ref);
568        let config = DuplicateConfig {
569            max_results: 3,
570            ..Default::default()
571        };
572
573        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
574        assert_eq!(groups.len(), 3, "Should respect max_results limit");
575    }
576
577    #[test]
578    fn test_groups_sorted_by_size() {
579        // Create groups of different sizes and verify they're sorted largest first
580        let nodes = [
581            // Group of 3
582            ("large_a", NodeKind::Function, Some("fn large()")),
583            ("large_b", NodeKind::Function, Some("fn large()")),
584            ("large_c", NodeKind::Function, Some("fn large()")),
585            // Group of 2
586            ("small_a", NodeKind::Function, Some("fn small()")),
587            ("small_b", NodeKind::Function, Some("fn small()")),
588        ];
589        let (graph, _) = create_test_graph_with_signatures(&nodes);
590        let config = DuplicateConfig::default();
591
592        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
593        assert_eq!(groups.len(), 2);
594        assert_eq!(groups[0].node_ids.len(), 3, "Largest group should be first");
595        assert_eq!(
596            groups[1].node_ids.len(),
597            2,
598            "Smaller group should be second"
599        );
600    }
601
602    #[test]
603    fn test_single_node_not_duplicate() {
604        // A single node is not a duplicate
605        let nodes = [
606            ("unique_func", NodeKind::Function, Some("fn unique()")),
607            ("other_func", NodeKind::Function, Some("fn different()")),
608        ];
609        let (graph, _) = create_test_graph_with_signatures(&nodes);
610        let config = DuplicateConfig::default();
611
612        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
613        assert!(
614            groups.is_empty(),
615            "Single nodes should not form duplicate groups"
616        );
617    }
618
619    #[test]
620    fn test_mixed_kinds_not_duplicates() {
621        // Same signature but different kinds should not be duplicates
622        let nodes = [
623            ("process", NodeKind::Function, Some("fn process()")),
624            ("process", NodeKind::Method, Some("fn process()")),
625        ];
626        let (graph, _) = create_test_graph_with_signatures(&nodes);
627        let config = DuplicateConfig::default();
628
629        // For Body type, kind matters in the hash
630        let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
631        assert!(
632            groups.is_empty(),
633            "Different kinds should not be duplicates for body type"
634        );
635
636        // For Signature type, only signature matters
637        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
638        assert_eq!(
639            groups.len(),
640            1,
641            "Same signature should be duplicates regardless of kind"
642        );
643    }
644
645    #[test]
646    fn test_multiple_duplicate_groups() {
647        // Multiple independent duplicate groups
648        let nodes = [
649            ("func_a1", NodeKind::Function, Some("fn alpha()")),
650            ("func_a2", NodeKind::Function, Some("fn alpha()")),
651            ("func_b1", NodeKind::Function, Some("fn beta()")),
652            ("func_b2", NodeKind::Function, Some("fn beta()")),
653            ("unique", NodeKind::Function, Some("fn gamma()")),
654        ];
655        let (graph, _) = create_test_graph_with_signatures(&nodes);
656        let config = DuplicateConfig::default();
657
658        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
659        assert_eq!(groups.len(), 2, "Should find two duplicate groups");
660    }
661
662    #[test]
663    fn test_exact_only_config() {
664        // When exact_only is true, behavior should be the same (hash-based)
665        let nodes = [
666            ("func_a", NodeKind::Function, Some("fn exact()")),
667            ("func_b", NodeKind::Function, Some("fn exact()")),
668        ];
669        let (graph, _) = create_test_graph_with_signatures(&nodes);
670        let config = DuplicateConfig {
671            is_exact_only: true,
672            ..Default::default()
673        };
674
675        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
676        assert_eq!(groups.len(), 1);
677    }
678
679    /// Gate 0d iter-2 regression: the public `duplicates:` query path
680    /// MUST skip Phase 4c-prime unified losers. Before the fix,
681    /// `graph_duplicates.rs:192` walked every arena entry and its
682    /// hash path keyed on `entry.body_hash` / `entry.signature`, both
683    /// of which remained on the loser post-merge. This test simulates
684    /// that exact shape:
685    ///
686    /// 1. Winner + loser with identical `body_hash` in different files.
687    /// 2. Unify via `merge_node_into`.
688    /// 3. Run `build_duplicate_groups_graph` with `DuplicateType::Body`.
689    /// 4. Assert the loser is NOT a member of any returned group, and
690    ///    that no winner-only group with only one member appears
691    ///    (single-node groups are filtered).
692    #[test]
693    fn duplicates_query_excludes_unified_losers() {
694        use crate::graph::body_hash::BodyHash128;
695        use crate::graph::unified::build::unification::merge_node_into;
696
697        let mut graph = CodeGraph::new();
698
699        let winner_name = graph.strings_mut().intern("shared_fn").unwrap();
700        let winner_qn = graph.strings_mut().intern("mod_a::shared_fn").unwrap();
701        let sig = graph.strings_mut().intern("fn shared_fn() -> ()").unwrap();
702        let body = BodyHash128 {
703            high: 0xDEAD_BEEF,
704            low: 0xCAFE_F00D,
705        };
706
707        let file_a = graph
708            .files_mut()
709            .register(Path::new("src/a.rs"))
710            .expect("register a");
711        let file_b = graph
712            .files_mut()
713            .register(Path::new("src/b.rs"))
714            .expect("register b");
715
716        let (winner_id, loser_id) = {
717            let arena = graph.nodes_mut();
718            let mut w = NodeEntry::new(NodeKind::Function, winner_name, file_a)
719                .with_location(10, 0, 20, 0)
720                .with_signature(sig)
721                .with_body_hash(body);
722            w.qualified_name = Some(winner_qn);
723            let w_id = arena.alloc(w).unwrap();
724
725            let mut l = NodeEntry::new(NodeKind::Function, winner_name, file_b)
726                .with_location(5, 0, 15, 0)
727                .with_signature(sig)
728                .with_body_hash(body);
729            l.qualified_name = Some(winner_qn);
730            let l_id = arena.alloc(l).unwrap();
731
732            (w_id, l_id)
733        };
734        graph.files_mut().record_node(file_a, winner_id);
735        graph.files_mut().record_node(file_b, loser_id);
736
737        // Perform the unification merge (same primitive Phase 4c-prime
738        // calls). The iter-2 fix additionally clears signature,
739        // body_hash, doc, visibility — so the loser's hash inputs
740        // should all be None post-merge.
741        merge_node_into(graph.nodes_mut(), loser_id, winner_id).unwrap();
742
743        // Sanity: the loser is marked unified, and its content-
744        // addressable metadata is cleared per iter-2 fix.
745        let loser_entry = graph.nodes().get(loser_id).expect("loser present");
746        assert!(loser_entry.is_unified_loser());
747        assert!(loser_entry.body_hash.is_none(), "body_hash must be cleared");
748        assert!(loser_entry.signature.is_none(), "signature must be cleared");
749
750        graph.rebuild_indices();
751
752        // The duplicates-by-body query is the publish-visible CD
753        // predicate that was leaking losers in iter-2.
754        let groups =
755            build_duplicate_groups_graph(DuplicateType::Body, &graph, &DuplicateConfig::default());
756
757        // The loser must NOT appear in any returned group. With only
758        // one winner and the loser filtered, there's no "group of 2"
759        // to return, so the result set should be empty.
760        for group in &groups {
761            assert!(
762                !group.node_ids.contains(&loser_id),
763                "Unified loser leaked into `duplicates:body` group: {:?}",
764                group.node_ids
765            );
766        }
767        assert!(
768            groups.is_empty(),
769            "After filtering losers, the winner is alone and no duplicate group should \
770             be returned; got {} groups: {groups:?}",
771            groups.len(),
772        );
773
774        // Also verify the signature duplicate path is free of losers.
775        let sig_groups = build_duplicate_groups_graph(
776            DuplicateType::Signature,
777            &graph,
778            &DuplicateConfig::default(),
779        );
780        for group in &sig_groups {
781            assert!(
782                !group.node_ids.contains(&loser_id),
783                "Unified loser leaked into `duplicates:signature` group: {:?}",
784                group.node_ids
785            );
786        }
787        assert!(
788            sig_groups.is_empty(),
789            "After filtering losers, the winner is alone and no signature duplicate \
790             group should be returned; got {} groups",
791            sig_groups.len(),
792        );
793    }
794
795    // -------------------------------------------------------------------------
796    // DUP_1 — per-group member cap tests
797    // -------------------------------------------------------------------------
798
799    /// Create a graph where `member_count` nodes share the same signature so they
800    /// all land in one duplicate group.  Nodes are spread across two files so the
801    /// deterministic-ordering test has a non-trivial sort key.
802    fn create_large_group_graph(member_count: usize) -> (CodeGraph, Vec<NodeId>) {
803        let mut graph = CodeGraph::new();
804        let file_a = graph.files_mut().register(Path::new("src/a.rs")).unwrap();
805        let file_b = graph.files_mut().register(Path::new("src/b.rs")).unwrap();
806        let shared_sig = graph
807            .strings_mut()
808            .intern("fn duplicate_fn() -> i32")
809            .unwrap();
810        let mut node_ids = Vec::new();
811
812        for i in 0..member_count {
813            let name = format!("dup_fn_{i}");
814            let name_id = graph.strings_mut().intern(&name).unwrap();
815            // Alternate between the two files to make ordering non-trivial.
816            let file_id = if i % 2 == 0 { file_a } else { file_b };
817            let entry = NodeEntry::new(NodeKind::Function, name_id, file_id)
818                .with_qualified_name(name_id)
819                .with_signature(shared_sig);
820            let node_id = graph.nodes_mut().alloc(entry).unwrap();
821            node_ids.push(node_id);
822        }
823
824        (graph, node_ids)
825    }
826
827    /// Truncation: `max_members_per_group = 3` on a group with 10 members.
828    /// Expects: `node_ids.len() == 3`, `total_members == 10`,
829    /// `members_truncated == true`.
830    #[test]
831    fn test_per_group_cap_truncation() {
832        let (graph, _) = create_large_group_graph(10);
833        let config = DuplicateConfig {
834            max_members_per_group: 3,
835            ..Default::default()
836        };
837
838        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
839        assert_eq!(groups.len(), 1, "Expected exactly one duplicate group");
840
841        let group = &groups[0];
842        assert_eq!(
843            group.total_members, 10,
844            "total_members must be pre-truncation count"
845        );
846        assert!(group.members_truncated, "members_truncated must be true");
847        assert_eq!(
848            group.node_ids.len(),
849            3,
850            "displayed node_ids must be capped at max_members_per_group"
851        );
852    }
853
854    /// `max_members_per_group = 0` disables the cap — all members are returned
855    /// and `members_truncated` is `false`.
856    #[test]
857    fn test_per_group_cap_zero_means_unlimited() {
858        let (graph, _) = create_large_group_graph(10);
859        let config = DuplicateConfig {
860            max_members_per_group: 0,
861            ..Default::default()
862        };
863
864        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
865        assert_eq!(groups.len(), 1);
866
867        let group = &groups[0];
868        assert_eq!(group.total_members, 10);
869        assert!(
870            !group.members_truncated,
871            "members_truncated must be false when unlimited"
872        );
873        assert_eq!(
874            group.node_ids.len(),
875            10,
876            "All members must be returned when cap is 0"
877        );
878    }
879
880    /// Group with fewer members than the cap — no truncation, `members_truncated`
881    /// remains `false`, and `total_members == node_ids.len()`.
882    #[test]
883    fn test_per_group_cap_no_truncation_when_below_cap() {
884        let (graph, _) = create_large_group_graph(5);
885        let config = DuplicateConfig {
886            max_members_per_group: 10,
887            ..Default::default()
888        };
889
890        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
891        assert_eq!(groups.len(), 1);
892
893        let group = &groups[0];
894        assert_eq!(group.total_members, 5);
895        assert!(
896            !group.members_truncated,
897            "members_truncated must be false when count <= cap"
898        );
899        assert_eq!(
900            group.node_ids.len(),
901            5,
902            "All members must be present when below cap"
903        );
904    }
905
906    /// A group with exactly 1 displayed member after truncation (cap = 1) is
907    /// still returned — single-member *truncated* groups are valid because the
908    /// caller knows there are more members via `total_members`.
909    #[test]
910    fn test_per_group_cap_one_member_displayed() {
911        let (graph, _) = create_large_group_graph(5);
912        let config = DuplicateConfig {
913            max_members_per_group: 1,
914            ..Default::default()
915        };
916
917        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
918        assert_eq!(
919            groups.len(),
920            1,
921            "Group must not be filtered just because only 1 member is displayed"
922        );
923
924        let group = &groups[0];
925        assert_eq!(group.total_members, 5);
926        assert!(group.members_truncated);
927        assert_eq!(group.node_ids.len(), 1);
928    }
929
930    /// Members in each group are sorted deterministically by (file_path, node_name, NodeId).
931    /// Nodes are inserted in reverse-alphabetical order; after sorting the first
932    /// element must be the lexicographically smallest (file, name) pair.
933    #[test]
934    fn test_per_group_deterministic_ordering() {
935        let mut graph = CodeGraph::new();
936        let file_a = graph.files_mut().register(Path::new("src/a.rs")).unwrap();
937        let file_b = graph.files_mut().register(Path::new("src/b.rs")).unwrap();
938        let shared_sig = graph.strings_mut().intern("fn stable_fn() -> i32").unwrap();
939
940        // Insert in "wrong" order (b.rs before a.rs; z before a within a file)
941        // to verify stable sorting regardless of insertion/HashMap order.
942        let names_and_files: &[(&str, _)] = &[
943            ("z_func", file_b),
944            ("m_func", file_b),
945            ("a_func", file_b),
946            ("z_func", file_a),
947            ("a_func", file_a),
948        ];
949
950        let mut inserted: Vec<(String, String, NodeId)> = Vec::new();
951        for (name, file_id) in names_and_files {
952            let name_id = graph.strings_mut().intern(name).unwrap();
953            let entry = NodeEntry::new(NodeKind::Function, name_id, *file_id)
954                .with_qualified_name(name_id)
955                .with_signature(shared_sig);
956            let node_id = graph.nodes_mut().alloc(entry).unwrap();
957            let path = if *file_id == file_a {
958                "src/a.rs".to_owned()
959            } else {
960                "src/b.rs".to_owned()
961            };
962            inserted.push((path, (*name).to_owned(), node_id));
963        }
964
965        // Compute expected order: sort by (file_path, name, NodeId).
966        let mut expected = inserted.clone();
967        expected.sort_by(|(pa, na, ia), (pb, nb, ib)| {
968            pa.cmp(pb).then_with(|| na.cmp(nb)).then_with(|| ia.cmp(ib))
969        });
970        let expected_ids: Vec<NodeId> = expected.iter().map(|(_, _, id)| *id).collect();
971
972        let config = DuplicateConfig {
973            max_members_per_group: 0, // unlimited — don't truncate
974            ..Default::default()
975        };
976        let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
977        assert_eq!(groups.len(), 1, "Expected one duplicate group");
978        assert_eq!(
979            groups[0].node_ids, expected_ids,
980            "node_ids must be in deterministic (file_path, name, NodeId) order"
981        );
982        assert_eq!(groups[0].total_members, 5);
983        assert!(!groups[0].members_truncated);
984    }
985}