fallow_core/duplicates/
mod.rs1pub mod detect;
9pub mod normalize;
10pub mod tokenize;
11pub mod types;
12
13use std::path::{Path, PathBuf};
14
15use globset::{Glob, GlobSet, GlobSetBuilder};
16use rayon::prelude::*;
17
18use detect::CloneDetector;
19use normalize::normalize_and_hash;
20use tokenize::tokenize_file;
21pub use types::{
22 CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats,
23};
24
25use crate::discover::{self, DiscoveredFile};
26
27pub fn find_duplicates(
35 root: &Path,
36 files: &[DiscoveredFile],
37 config: &DuplicatesConfig,
38) -> DuplicationReport {
39 let _span = tracing::info_span!("find_duplicates").entered();
40
41 let extra_ignores = build_ignore_set(&config.ignore);
43
44 let file_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> = files
46 .par_iter()
47 .filter_map(|file| {
48 let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
50 if let Some(ref ignores) = extra_ignores
51 && ignores.is_match(relative)
52 {
53 return None;
54 }
55
56 let source = std::fs::read_to_string(&file.path).ok()?;
58
59 let file_tokens = tokenize_file(&file.path, &source);
61 if file_tokens.tokens.is_empty() {
62 return None;
63 }
64
65 let hashed = normalize_and_hash(&file_tokens.tokens, config.mode);
67 if hashed.len() < config.min_tokens {
68 return None;
69 }
70
71 Some((file.path.clone(), hashed, file_tokens))
72 })
73 .collect();
74
75 tracing::info!(
76 files = file_data.len(),
77 "tokenized files for duplication analysis"
78 );
79
80 let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
82 detector.detect(file_data)
83}
84
85pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
89 let resolved = crate::default_config(root);
90 let files = discover::discover_files(&resolved);
91 find_duplicates(root, &files, config)
92}
93
94fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
96 if patterns.is_empty() {
97 return None;
98 }
99
100 let mut builder = GlobSetBuilder::new();
101 for pattern in patterns {
102 match Glob::new(pattern) {
103 Ok(glob) => {
104 builder.add(glob);
105 }
106 Err(e) => {
107 tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
108 }
109 }
110 }
111
112 builder.build().ok()
113}
114
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    /// An empty file list must produce an empty report.
    #[test]
    fn find_duplicates_empty_files() {
        let report = find_duplicates(Path::new("/tmp"), &[], &DuplicatesConfig::default());

        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    /// No patterns means no ignore set at all.
    #[test]
    fn build_ignore_set_empty() {
        let ignore_set = build_ignore_set(&[]);
        assert!(ignore_set.is_none());
    }

    /// Valid patterns compile into a set that matches only the intended paths.
    #[test]
    fn build_ignore_set_valid_patterns() {
        let patterns = ["**/*.test.ts", "**/*.spec.ts"].map(String::from);

        let compiled = build_ignore_set(&patterns);
        assert!(compiled.is_some());
        let globs = compiled.unwrap();

        let cases = [
            ("src/foo.test.ts", true),
            ("src/bar.spec.ts", true),
            ("src/baz.ts", false),
        ];
        for (candidate, should_match) in cases {
            assert_eq!(globs.is_match(candidate), should_match);
        }
    }

    /// Two byte-identical files on disk must be reported as clones of each other.
    #[test]
    fn find_duplicates_with_real_files() {
        let workspace = tempfile::tempdir().expect("create temp dir");
        let src_dir = workspace.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        let original_path = src_dir.join("original.ts");
        let copy_path = src_dir.join("copy.ts");
        std::fs::write(&original_path, code).expect("write original");
        std::fs::write(&copy_path, code).expect("write copy");
        std::fs::write(
            workspace.path().join("package.json"),
            r#"{"name": "test"}"#,
        )
        .expect("write package.json");

        let size_bytes = code.len() as u64;
        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: original_path,
                size_bytes,
            },
            DiscoveredFile {
                id: FileId(1),
                path: copy_path,
                size_bytes,
            },
        ];

        // Lower the thresholds so the small fixture is large enough to count as a clone.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(workspace.path(), &files, &config);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);
    }
}