1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
use clap::{Args, Parser, Subcommand};
use std::path::PathBuf;
// Top-level command-line definition for the `dupehound` binary.
//
// `#[derive(Parser)]` has clap generate the argument parser from this struct.
// The `name`, `about` and `long_about` values are user-facing text shown in
// the help output, so edit them deliberately. `version` is given without a
// value, leaving clap to supply it (per the clap derive docs this comes from
// the crate's package version).
//
// The trailing `\` inside `long_about` is a Rust string continuation: the
// newline and the leading whitespace of the following line are dropped, so
// the text is a single paragraph at runtime.
#[derive(Parser)]
#[command(
name = "dupehound",
version,
about = "Sniffs out near-duplicate code. Fast, offline, no AI required.",
long_about = "dupehound finds near-duplicate functions across your codebase — even when \
identifiers and literals were renamed. It fingerprints normalized syntax using the winnowing \
algorithm (Schleimer, Wilkerson & Aiken, SIGMOD 2003) and never sends code anywhere."
)]
pub struct Cli {
// The subcommand selected on the command line (one variant of `Command`).
#[command(subcommand)]
pub command: Command,
}
// The subcommands accepted by the CLI.
//
// The `///` lines on each variant are read by clap's derive macro and become
// that subcommand's help text, so they are user-facing output as well as
// documentation. `Scan`, `History` and `Check` each carry their own argument
// struct; `Mcp` takes no arguments.
#[derive(Subcommand)]
pub enum Command {
/// Scan a directory for duplicate functions and compute the slop score
Scan(ScanArgs),
/// Chart duplication over git history and find the inflection point
History(HistoryArgs),
/// CI gate: fail when newly added code duplicates existing code
Check(CheckArgs),
/// Run as an MCP server over stdio, exposing check and scan as tools an
/// AI coding agent can call in its loop
Mcp,
}
// Options shared by the `scan`, `history` and `check` subcommands; each of
// their argument structs embeds this one through `#[command(flatten)]`.
//
// The `///` lines on the fields are consumed by clap as the per-flag help
// text, so they are user-facing strings.
#[derive(Args)]
pub struct CommonArgs {
// No default is declared: the value is `None` when the flag is absent.
// NOTE(review): the 0.0-1.0 range named in the help text is not enforced by
// this declaration — confirm it is validated where the value is consumed.
/// Minimum similarity (0.0-1.0) for two functions to count as duplicates
#[arg(long)]
pub threshold: Option<f64>,
// Falls back to 40 when the flag is not given.
/// Ignore functions with fewer normalized tokens than this
#[arg(long, default_value_t = 40)]
pub min_tokens: usize,
// The flag is spelled `--exclude` (singular) even though the field is
// plural; the `Vec` collects every occurrence of the flag.
/// Extra glob patterns to exclude (repeatable)
#[arg(long = "exclude", value_name = "GLOB")]
pub excludes: Vec<String>,
/// Don't apply the built-in exclusions (vendor/, dist/, generated files, ...)
#[arg(long)]
pub no_default_excludes: bool,
/// Include test files in the slop score (they are excluded by default)
#[arg(long)]
pub include_tests: bool,
// `conflicts_with` makes this flag and `--include-tests` mutually exclusive:
// clap rejects a command line that passes both.
/// Skip test files entirely (default: scanned but excluded from the score)
#[arg(long, conflicts_with = "include_tests")]
pub exclude_tests: bool,
/// Emit machine-readable JSON instead of the terminal report
#[arg(long)]
pub json: bool,
}
// Arguments for the `scan` subcommand (`Command::Scan`).
//
// The `///` lines on the fields are consumed by clap as the per-argument help
// text, so they are user-facing strings.
#[derive(Args)]
pub struct ScanArgs {
// Positional argument (no `long`/`short`); defaults to the current directory.
/// Directory to scan
#[arg(default_value = ".")]
pub path: PathBuf,
// Pulls in every option from `CommonArgs` as if declared directly here.
#[command(flatten)]
pub common: CommonArgs,
/// Show every cluster instead of the top 10
#[arg(long)]
pub all: bool,
// `None` when `--explain` is not passed; otherwise the cluster number given.
/// Diff the copies of cluster N against the representative
#[arg(long, value_name = "CLUSTER")]
pub explain: Option<usize>,
// `requires = "explain"` makes clap reject `--full` unless `--explain` is
// also present.
/// With --explain, show full function bodies instead of a diff
#[arg(long, requires = "explain")]
pub full: bool,
/// Also write a shareable score card (dupehound-card.svg/.png)
#[arg(long)]
pub card: bool,
/// Experimental: also report C# classes whose property/method signatures
/// are near-duplicates (separate from the function clusters and the score)
#[arg(long)]
pub include_classes: bool,
}
// Arguments for the `history` subcommand (`Command::History`).
//
// The `///` lines on the fields are consumed by clap as the per-argument help
// text, so they are user-facing strings.
#[derive(Args)]
pub struct HistoryArgs {
// Positional argument (no `long`/`short`); defaults to the current directory.
/// Git repository to analyze
#[arg(default_value = ".")]
pub path: PathBuf,
// Pulls in every option from `CommonArgs` as if declared directly here.
#[command(flatten)]
pub common: CommonArgs,
// Falls back to 36 when the flag is not given.
/// Maximum number of historical snapshots to measure
#[arg(long, default_value_t = 36)]
pub max_snapshots: usize,
// Opt-out flag: per its help text the card is written unless this is set,
// whereas `scan` uses an opt-in `--card` flag.
/// Skip writing the shareable card
#[arg(long)]
pub no_card: bool,
}
// Arguments for the `check` subcommand (`Command::Check`).
//
// The `///` lines on the fields are consumed by clap as the per-argument help
// text, so they are user-facing strings.
#[derive(Args)]
pub struct CheckArgs {
// Positional argument (no `long`/`short`); defaults to the current directory.
/// Git repository to check
#[arg(default_value = ".")]
pub path: PathBuf,
// Pulls in every option from `CommonArgs` as if declared directly here.
#[command(flatten)]
pub common: CommonArgs,
// `None` when `--diff` is not passed; what the check compares against in
// that case is decided by the consumer of this struct, not visible here.
/// Compare against the merge-base with this revision (PR semantics)
#[arg(long, value_name = "REV")]
pub diff: Option<String>,
}