1use crate::graph::body_hash::BodyHash128;
7use crate::graph::unified::concurrent::CodeGraph;
8use crate::graph::unified::node::{NodeId, NodeKind};
9use std::collections::HashMap;
10use std::hash::{Hash, Hasher};
11use std::str::FromStr;
12
13#[derive(Debug, Clone, Copy, PartialEq, Eq)]
15pub enum DuplicateType {
16 Body,
18 Signature,
20 Struct,
22}
23
24impl DuplicateType {
25 #[must_use]
27 pub fn parse(value: &str) -> Option<Self> {
28 value.parse().ok()
29 }
30}
31
32impl FromStr for DuplicateType {
33 type Err = anyhow::Error;
34
35 fn from_str(value: &str) -> Result<Self, Self::Err> {
36 match value.trim().to_lowercase().as_str() {
37 "body" | "function" => Ok(Self::Body),
38 "signature" | "sig" => Ok(Self::Signature),
39 "struct" | "type" => Ok(Self::Struct),
40 _ => Err(anyhow::anyhow!("Unknown duplicate type: {value}")),
41 }
42 }
43}
44
45#[derive(Debug, Clone)]
47pub struct DuplicateConfig {
48 pub threshold: f64,
50 #[allow(dead_code)]
52 pub max_results: usize,
53 pub is_exact_only: bool,
55}
56
57impl Default for DuplicateConfig {
58 fn default() -> Self {
59 Self {
60 threshold: 0.8,
61 max_results: 1000,
62 is_exact_only: false,
63 }
64 }
65}
66
67#[derive(Debug, Clone)]
69pub struct DuplicateGroup {
70 pub hash: u64,
72 pub body_hash_128: Option<BodyHash128>,
77 pub node_ids: Vec<NodeId>,
79}
80
81fn compute_hash(graph: &CodeGraph, node_id: NodeId, dup_type: DuplicateType) -> Option<u64> {
83 let entry = graph.nodes().get(node_id)?;
84 let strings = graph.strings();
85
86 match dup_type {
87 DuplicateType::Body => {
88 if let Some(body_hash) = entry.body_hash {
91 return Some(body_hash.high ^ body_hash.low);
94 }
95
96 if let Some(sig_id) = entry.signature
98 && let Some(sig) = strings.resolve(sig_id)
99 {
100 let mut hasher = std::collections::hash_map::DefaultHasher::new();
101 sig.hash(&mut hasher);
102 entry.kind.hash(&mut hasher);
103 return Some(hasher.finish());
104 }
105
106 let name = entry
108 .qualified_name
109 .and_then(|id| strings.resolve(id))
110 .or_else(|| strings.resolve(entry.name))?;
111
112 let mut hasher = std::collections::hash_map::DefaultHasher::new();
113 name.hash(&mut hasher);
114 entry.kind.hash(&mut hasher);
115 let lines = entry.end_line.saturating_sub(entry.start_line);
117 lines.hash(&mut hasher);
118 Some(hasher.finish())
119 }
120 DuplicateType::Signature => {
121 if let Some(sig_id) = entry.signature
123 && let Some(sig) = strings.resolve(sig_id)
124 {
125 let mut hasher = std::collections::hash_map::DefaultHasher::new();
126 sig.hash(&mut hasher);
127 return Some(hasher.finish());
128 }
129
130 let name = strings.resolve(entry.name)?;
132 let mut hasher = std::collections::hash_map::DefaultHasher::new();
133 name.hash(&mut hasher);
134 entry.kind.hash(&mut hasher);
135 Some(hasher.finish())
136 }
137 DuplicateType::Struct => {
138 if !matches!(entry.kind, NodeKind::Struct | NodeKind::Class) {
140 return None;
141 }
142
143 let name = entry
145 .qualified_name
146 .and_then(|id| strings.resolve(id))
147 .or_else(|| strings.resolve(entry.name))?;
148
149 let mut hasher = std::collections::hash_map::DefaultHasher::new();
150 name.hash(&mut hasher);
151 entry.kind.hash(&mut hasher);
152 Some(hasher.finish())
153 }
154 }
155}
156
157#[must_use]
176pub fn build_duplicate_groups_graph(
177 dup_type: DuplicateType,
178 graph: &CodeGraph,
179 config: &DuplicateConfig,
180) -> Vec<DuplicateGroup> {
181 let mut hash_to_nodes: HashMap<u64, Vec<NodeId>> = HashMap::new();
182
183 let relevant_kinds: Vec<NodeKind> = match dup_type {
185 DuplicateType::Body | DuplicateType::Signature => {
186 vec![NodeKind::Function, NodeKind::Method]
187 }
188 DuplicateType::Struct => vec![NodeKind::Struct, NodeKind::Class],
189 };
190
191 for (node_id, entry) in graph.nodes().iter() {
193 if !relevant_kinds.contains(&entry.kind) {
194 continue;
195 }
196
197 if let Some(hash) = compute_hash(graph, node_id, dup_type) {
198 hash_to_nodes.entry(hash).or_default().push(node_id);
199 }
200 }
201
202 let mut groups: Vec<DuplicateGroup> = hash_to_nodes
204 .into_iter()
205 .filter(|(_, nodes)| {
206 if config.is_exact_only {
207 nodes.len() > 1
208 } else {
209 nodes.len() > 1
212 }
213 })
214 .map(|(hash, node_ids)| {
215 let body_hash_128 = if dup_type == DuplicateType::Body {
217 node_ids
218 .first()
219 .and_then(|&id| graph.nodes().get(id).and_then(|entry| entry.body_hash))
220 } else {
221 None
222 };
223
224 DuplicateGroup {
225 hash,
226 body_hash_128,
227 node_ids,
228 }
229 })
230 .collect();
231
232 groups.sort_by(|a, b| b.node_ids.len().cmp(&a.node_ids.len()));
234
235 groups.truncate(config.max_results);
237
238 groups
239}
240
241#[cfg(test)]
242mod tests {
243 use super::*;
244 use crate::graph::unified::storage::arena::NodeEntry;
245 use std::path::Path;
246
247 fn create_test_graph_with_signatures(
249 nodes: &[(&str, NodeKind, Option<&str>)],
250 ) -> (CodeGraph, Vec<NodeId>) {
251 let mut graph = CodeGraph::new();
252 let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
253 let mut node_ids = Vec::new();
254
255 for (name, kind, signature) in nodes {
256 let name_id = graph.strings_mut().intern(name).unwrap();
257 let mut entry = NodeEntry::new(*kind, name_id, file_id).with_qualified_name(name_id);
258
259 if let Some(sig) = signature {
260 let sig_id = graph.strings_mut().intern(sig).unwrap();
261 entry = entry.with_signature(sig_id);
262 }
263
264 let node_id = graph.nodes_mut().alloc(entry).unwrap();
265 node_ids.push(node_id);
266 }
267
268 (graph, node_ids)
269 }
270
271 fn create_test_graph_with_spans(
273 nodes: &[(&str, NodeKind, u32, u32)], ) -> (CodeGraph, Vec<NodeId>) {
275 let mut graph = CodeGraph::new();
276 let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
277 let mut node_ids = Vec::new();
278
279 for (name, kind, start_line, end_line) in nodes {
280 let name_id = graph.strings_mut().intern(name).unwrap();
281 let entry = NodeEntry::new(*kind, name_id, file_id)
282 .with_qualified_name(name_id)
283 .with_location(*start_line, 0, *end_line, 0);
284
285 let node_id = graph.nodes_mut().alloc(entry).unwrap();
286 node_ids.push(node_id);
287 }
288
289 (graph, node_ids)
290 }
291
292 #[test]
293 fn test_empty_graph() {
294 let graph = CodeGraph::new();
295 let config = DuplicateConfig::default();
296
297 let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
298 assert!(groups.is_empty());
299 }
300
301 #[test]
302 fn test_no_duplicates_unique_signatures() {
303 let nodes = [
305 ("func_a", NodeKind::Function, Some("fn func_a() -> i32")),
306 ("func_b", NodeKind::Function, Some("fn func_b() -> String")),
307 (
308 "func_c",
309 NodeKind::Function,
310 Some("fn func_c(x: i32) -> bool"),
311 ),
312 ];
313 let (graph, _) = create_test_graph_with_signatures(&nodes);
314 let config = DuplicateConfig::default();
315
316 let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
317 assert!(groups.is_empty(), "No duplicates should be found");
318 }
319
320 #[test]
321 fn test_signature_duplicates() {
322 let nodes = [
324 (
325 "func_a",
326 NodeKind::Function,
327 Some("fn process(x: i32) -> i32"),
328 ),
329 (
330 "func_b",
331 NodeKind::Function,
332 Some("fn process(x: i32) -> i32"),
333 ),
334 ("func_c", NodeKind::Function, Some("fn other() -> String")),
335 ];
336 let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
337 let config = DuplicateConfig::default();
338
339 let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
340 assert_eq!(groups.len(), 1, "Should find one duplicate group");
341 assert_eq!(
342 groups[0].node_ids.len(),
343 2,
344 "Group should have 2 duplicates"
345 );
346 assert!(groups[0].node_ids.contains(&node_ids[0]));
347 assert!(groups[0].node_ids.contains(&node_ids[1]));
348 }
349
350 #[test]
351 fn test_body_duplicates_with_signatures() {
352 let nodes = [
354 (
355 "func_a",
356 NodeKind::Function,
357 Some("fn compute(x: i32) -> i32"),
358 ),
359 (
360 "func_b",
361 NodeKind::Function,
362 Some("fn compute(x: i32) -> i32"),
363 ),
364 (
365 "func_c",
366 NodeKind::Method,
367 Some("fn compute(x: i32) -> i32"),
368 ), ];
370 let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
371 let config = DuplicateConfig::default();
372
373 let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
374 assert_eq!(groups.len(), 1, "Should find one duplicate group");
377 assert_eq!(groups[0].node_ids.len(), 2);
378 assert!(groups[0].node_ids.contains(&node_ids[0]));
379 assert!(groups[0].node_ids.contains(&node_ids[1]));
380 }
381
382 #[test]
383 fn test_body_duplicates_fallback_to_name_and_span() {
384 let nodes = [
386 ("helper", NodeKind::Function, 10, 20), ("helper", NodeKind::Function, 30, 40), ("other", NodeKind::Function, 50, 60), ];
390 let (graph, node_ids) = create_test_graph_with_spans(&nodes);
391 let config = DuplicateConfig::default();
392
393 let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
394 assert_eq!(groups.len(), 1, "Should find one duplicate group");
396 assert!(groups[0].node_ids.contains(&node_ids[0]));
397 assert!(groups[0].node_ids.contains(&node_ids[1]));
398 }
399
400 #[test]
401 fn test_struct_duplicates() {
402 let nodes = [
404 ("MyStruct", NodeKind::Struct, None),
405 ("MyStruct", NodeKind::Struct, None),
406 ("MyStruct", NodeKind::Function, None), ("OtherStruct", NodeKind::Struct, None),
408 ];
409 let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
410 let config = DuplicateConfig::default();
411
412 let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
413 assert_eq!(groups.len(), 1, "Should find one duplicate group");
415 assert!(groups[0].node_ids.contains(&node_ids[0]));
416 assert!(groups[0].node_ids.contains(&node_ids[1]));
417 assert!(!groups[0].node_ids.contains(&node_ids[2])); }
419
420 #[test]
421 fn test_class_duplicates() {
422 let nodes = [
424 ("MyClass", NodeKind::Class, None),
425 ("MyClass", NodeKind::Class, None),
426 ("OtherClass", NodeKind::Class, None),
427 ];
428 let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
429 let config = DuplicateConfig::default();
430
431 let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
432 assert_eq!(groups.len(), 1);
433 assert!(groups[0].node_ids.contains(&node_ids[0]));
434 assert!(groups[0].node_ids.contains(&node_ids[1]));
435 }
436
437 #[test]
438 fn test_methods_ignored_for_struct_duplicates() {
439 let nodes = [
441 ("process", NodeKind::Method, Some("fn process()")),
442 ("process", NodeKind::Method, Some("fn process()")),
443 ];
444 let (graph, _) = create_test_graph_with_signatures(&nodes);
445 let config = DuplicateConfig::default();
446
447 let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
448 assert!(
449 groups.is_empty(),
450 "Methods should not be considered for struct duplicates"
451 );
452 }
453
454 #[test]
455 fn test_max_results_limit() {
456 let mut nodes = Vec::new();
458 for i in 0..10 {
459 let sig = format!("fn group{i}()");
461 nodes.push((format!("func_{i}_a"), NodeKind::Function, Some(sig.clone())));
462 nodes.push((format!("func_{i}_b"), NodeKind::Function, Some(sig)));
463 }
464
465 let nodes_ref: Vec<(&str, NodeKind, Option<&str>)> = nodes
466 .iter()
467 .map(|(name, kind, sig)| (name.as_str(), *kind, sig.as_deref()))
468 .collect();
469 let (graph, _) = create_test_graph_with_signatures(&nodes_ref);
470 let config = DuplicateConfig {
471 max_results: 3,
472 ..Default::default()
473 };
474
475 let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
476 assert_eq!(groups.len(), 3, "Should respect max_results limit");
477 }
478
479 #[test]
480 fn test_groups_sorted_by_size() {
481 let nodes = [
483 ("large_a", NodeKind::Function, Some("fn large()")),
485 ("large_b", NodeKind::Function, Some("fn large()")),
486 ("large_c", NodeKind::Function, Some("fn large()")),
487 ("small_a", NodeKind::Function, Some("fn small()")),
489 ("small_b", NodeKind::Function, Some("fn small()")),
490 ];
491 let (graph, _) = create_test_graph_with_signatures(&nodes);
492 let config = DuplicateConfig::default();
493
494 let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
495 assert_eq!(groups.len(), 2);
496 assert_eq!(groups[0].node_ids.len(), 3, "Largest group should be first");
497 assert_eq!(
498 groups[1].node_ids.len(),
499 2,
500 "Smaller group should be second"
501 );
502 }
503
504 #[test]
505 fn test_single_node_not_duplicate() {
506 let nodes = [
508 ("unique_func", NodeKind::Function, Some("fn unique()")),
509 ("other_func", NodeKind::Function, Some("fn different()")),
510 ];
511 let (graph, _) = create_test_graph_with_signatures(&nodes);
512 let config = DuplicateConfig::default();
513
514 let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
515 assert!(
516 groups.is_empty(),
517 "Single nodes should not form duplicate groups"
518 );
519 }
520
521 #[test]
522 fn test_mixed_kinds_not_duplicates() {
523 let nodes = [
525 ("process", NodeKind::Function, Some("fn process()")),
526 ("process", NodeKind::Method, Some("fn process()")),
527 ];
528 let (graph, _) = create_test_graph_with_signatures(&nodes);
529 let config = DuplicateConfig::default();
530
531 let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
533 assert!(
534 groups.is_empty(),
535 "Different kinds should not be duplicates for body type"
536 );
537
538 let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
540 assert_eq!(
541 groups.len(),
542 1,
543 "Same signature should be duplicates regardless of kind"
544 );
545 }
546
547 #[test]
548 fn test_multiple_duplicate_groups() {
549 let nodes = [
551 ("func_a1", NodeKind::Function, Some("fn alpha()")),
552 ("func_a2", NodeKind::Function, Some("fn alpha()")),
553 ("func_b1", NodeKind::Function, Some("fn beta()")),
554 ("func_b2", NodeKind::Function, Some("fn beta()")),
555 ("unique", NodeKind::Function, Some("fn gamma()")),
556 ];
557 let (graph, _) = create_test_graph_with_signatures(&nodes);
558 let config = DuplicateConfig::default();
559
560 let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
561 assert_eq!(groups.len(), 2, "Should find two duplicate groups");
562 }
563
564 #[test]
565 fn test_exact_only_config() {
566 let nodes = [
568 ("func_a", NodeKind::Function, Some("fn exact()")),
569 ("func_b", NodeKind::Function, Some("fn exact()")),
570 ];
571 let (graph, _) = create_test_graph_with_signatures(&nodes);
572 let config = DuplicateConfig {
573 is_exact_only: true,
574 ..Default::default()
575 };
576
577 let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
578 assert_eq!(groups.len(), 1);
579 }
580}