1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
// orchestrate.rs
use crate::statistics;
use crate::walker::{WalkConfig, walk};
use cpd_core::detect::{PreparedSource, detect_prepared};
use cpd_core::models::{CpdClone, SourceFile, Statistics};
use cpd_tokenizer::tokenizer::{
Mode, TokenizeOptions, code_ignore_ranges, tokenize_to_detection, tokenize_to_detection_maps,
};
use std::path::PathBuf;
/// Full run configuration.
///
/// Consumed by [`run`]: part of it is forwarded to the file walker
/// (`WalkConfig`), part to the tokenizer (`TokenizeOptions`), and part to the
/// clone detector.
#[derive(Debug, Clone)]
pub struct RunConfig {
/// Root paths handed to the file walker.
pub paths: Vec<PathBuf>,
/// Minimum token count: files (and embedded sub-format maps) with fewer
/// tokens are dropped before detection; also passed to the detector.
pub min_tokens: usize,
/// Minimum line count: files with fewer lines are skipped before
/// tokenization; also passed to the detector.
pub min_lines: usize,
/// Optional maximum line count: files with more lines are skipped.
pub max_lines: Option<usize>,
/// Tokenizer mode used for both the display and the detection token streams.
pub mode: Mode,
/// Forwarded to the walker as its `extensions` list.
pub formats: Vec<String>,
/// Forwarded to the walker as its `ignore_patterns` list.
pub ignore: Vec<String>,
/// Regex patterns matched against source text; tokens overlapping a match
/// are excluded from detection. Patterns that fail to compile are silently
/// skipped by `run`.
pub code_ignore_patterns: Vec<String>,
/// Forwarded to the walker as its size limit
/// (unit presumably bytes — TODO confirm against `WalkConfig`).
pub max_size: Option<u64>,
/// Forwarded to the walker unchanged.
pub no_gitignore: bool,
/// Forwarded to the walker unchanged.
pub follow_symlinks: bool,
/// Passed to the detector as its `skip_local` argument.
pub skip_local: bool,
/// NOTE(review): not read by `run` in this file; presumably consumed by a
/// caller or reporter — confirm.
pub blame: bool,
/// Number of rayon worker threads; `None` leaves the choice to rayon.
pub workers: Option<usize>,
/// Passed to the tokenizer options for the detection token stream.
pub ignore_case: bool,
/// Forwarded to the walker unchanged
/// (presumably format name -> file extensions — TODO confirm).
pub formats_exts: std::collections::HashMap<String, Vec<String>>,
/// Forwarded to the walker unchanged
/// (presumably format name -> file names — TODO confirm).
pub formats_names: std::collections::HashMap<String, Vec<String>>,
/// Forwarded to the walker unchanged.
pub pattern: Option<String>,
}
impl Default for RunConfig {
    /// Baseline configuration: no input paths, a 50-token / 5-line minimum,
    /// `Mode::Mild`, every optional limit unset, every boolean switch off and
    /// every list or map empty.
    fn default() -> Self {
        Self {
            // Inputs and walker-side filters.
            paths: Vec::new(),
            formats: Vec::new(),
            ignore: Vec::new(),
            pattern: None,
            max_size: None,
            no_gitignore: false,
            follow_symlinks: false,
            formats_exts: std::collections::HashMap::default(),
            formats_names: std::collections::HashMap::default(),

            // Size thresholds.
            min_tokens: 50,
            min_lines: 5,
            max_lines: None,

            // Tokenizer behaviour.
            mode: Mode::Mild,
            ignore_case: false,
            code_ignore_patterns: Vec::new(),

            // Detection and execution switches.
            skip_local: false,
            blame: false,
            workers: None,
        }
    }
}
/// Result of a full run.
///
/// Returned by [`run`] once walking, tokenizing, detection and statistics
/// have all completed.
pub struct RunResult {
/// Clones reported by the detector.
pub clones: Vec<CpdClone>,
/// Statistics computed from `sources` and `clones`.
pub statistics: Statistics,
/// Display-side source files that survived filtering. For multi-format
/// files this also contains one synthetic entry per embedded sub-format.
pub sources: Vec<SourceFile>,
}
/// Errors surfaced by the finder pipeline.
#[derive(Debug)]
pub enum FinderError {
    /// An underlying I/O operation failed.
    Io(std::io::Error),
    /// Any other failure, described by a free-form message.
    Other(String),
}

impl std::fmt::Display for FinderError {
    /// Renders a short, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Io(e) => write!(f, "I/O error: {e}"),
            Self::Other(s) => write!(f, "Error: {s}"),
        }
    }
}

impl std::error::Error for FinderError {
    /// Exposes the wrapped I/O error as the cause so that error-chain
    /// reporters and `downcast_ref` can reach it. `Other` carries only a
    /// message and therefore has no cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Io(e) => Some(e),
            Self::Other(_) => None,
        }
    }
}

impl From<std::io::Error> for FinderError {
    /// Lets `?` lift an I/O failure into [`FinderError::Io`].
    fn from(e: std::io::Error) -> Self {
        Self::Io(e)
    }
}
/// Run the full detection pipeline.
///
/// Stages:
/// 1. Walk `config.paths` to discover candidate files.
/// 2. Memory-map, filter and tokenize every file in parallel, producing the
///    display-side `SourceFile`s and the detection-side `PreparedSource`s.
/// 3. Group the prepared sources by format in a deterministic order.
/// 4. Run clone detection over the groups.
/// 5. Compute statistics from the surviving sources and the clones.
///
/// Files that cannot be opened, mapped or decoded as UTF-8 are skipped, as are
/// files outside the configured line/token thresholds. Entries of
/// `config.code_ignore_patterns` that are not valid regexes are silently
/// ignored.
///
/// # Errors
///
/// Returns [`FinderError::Other`] when no rayon thread pool can be built,
/// neither with the requested settings nor with rayon's defaults.
pub fn run(config: &RunConfig) -> Result<RunResult, FinderError> {
    use rayon::prelude::*;

    /// Formats that are tokenized through `tokenize_to_detection_maps`, which
    /// may yield several token maps (one per embedded sub-format) per file.
    const MULTI_FORMAT_EXTS: &[&str] = &["md", "markdown", "mkd", "vue", "svelte", "astro"];

    fn is_multi_format(format: &str) -> bool {
        MULTI_FORMAT_EXTS.contains(&format)
    }

    // Build a thread pool with a large stack to survive OXC parsing of deeply-nested
    // JS/TS files (e.g., thousands of chained for-loops with no body). OXC's
    // recursive-descent parser allocates one frame per nesting level; the default
    // worker-thread stack is insufficient for pathological inputs like Bun's
    // `lots-of-for-loop.js`. 64 MiB gives ample headroom while remaining reasonable.
    // A local pool (not build_global) avoids poisoning any caller-owned global pool
    // and can be created unconditionally on every run() call.
    let pool = {
        let mut builder =
            rayon::ThreadPoolBuilder::new().stack_size(64 * 1024 * 1024 /* 64 MiB */);
        if let Some(n) = config.workers {
            builder = builder.num_threads(n);
        }
        match builder.build() {
            Ok(pool) => pool,
            // Best effort: retry with rayon's defaults. If even that fails,
            // report it to the caller instead of panicking.
            Err(_) => rayon::ThreadPoolBuilder::new().build().map_err(|e| {
                FinderError::Other(format!("failed to build rayon thread pool: {e}"))
            })?,
        }
    };

    // 1. Walk files
    let walk_config = WalkConfig {
        paths: config.paths.clone(),
        extensions: config.formats.clone(),
        ignore_patterns: config.ignore.clone(),
        max_size: config.max_size,
        follow_symlinks: config.follow_symlinks,
        no_gitignore: config.no_gitignore,
        formats_exts: config.formats_exts.clone(),
        formats_names: config.formats_names.clone(),
        pattern: config.pattern.clone(),
    };
    let discovered = walk(&walk_config);

    // 2. Read + tokenize files in parallel.
    //    - Display path: produce Vec<Token> for SourceFile (used by reporters).
    //    - Detection path: produce detection tokens via tokenize_to_detection
    //      (never stored in SourceFile).
    //    - Multi-format files (markdown etc.) produce multiple token maps, one per
    //      embedded sub-language, so embedded code blocks join the correct pool.
    let mode = config.mode;
    let min_tokens = config.min_tokens;
    let min_lines = config.min_lines;
    let max_lines = config.max_lines;
    let skip_local = config.skip_local;
    let ignore_case = config.ignore_case;

    // Pre-compile code-level ignore regex patterns once for all threads.
    // Invalid patterns are silently skipped.
    let code_ignore_regexes: Vec<regex::Regex> = config
        .code_ignore_patterns
        .iter()
        .filter_map(|p| regex::Regex::new(p).ok())
        .collect();

    let results: Vec<(Vec<SourceFile>, Vec<PreparedSource>)> = pool.install(|| {
        discovered
            .into_par_iter()
            .filter_map(|file| {
                // Open and memory-map the file inside the worker. By NOT
                // storing the Mmap in DiscoveredFile we cap concurrent
                // mappings to the rayon thread-pool size, and we avoid the
                // Vec<u8> allocation a to_vec() copy would require.
                let f = std::fs::File::open(&file.path).ok()?;
                // SAFETY: the mapping is only read and is dropped before this
                // closure returns. As with any file-backed mmap, this relies on
                // the file not being truncated or rewritten while it is mapped.
                let mmap = unsafe { memmap2::Mmap::map(&f) }.ok()?;

                // Line-count filter — fast O(n) pass before UTF-8 decode.
                if min_lines > 0 || max_lines.is_some() {
                    let newlines = memchr::Memchr::new(b'\n', &mmap).count();
                    // A final line without a trailing newline still counts.
                    let unterminated_tail = mmap.last().is_some_and(|&b| b != b'\n');
                    let lc = newlines + usize::from(unterminated_tail);
                    if lc < min_lines {
                        return None;
                    }
                    if max_lines.is_some_and(|m| lc > m) {
                        return None;
                    }
                }

                let content = str::from_utf8(&mmap).ok()?;
                let id = file.path.to_string_lossy().into_owned();

                // Display path: flat tokenization of the whole file. Doing this
                // first lets undersized files exit before the regex scan and
                // the detection tokenization below.
                let tokens = cpd_tokenizer::tokenizer::tokenize(&file.format, content, mode);
                if tokens.len() < min_tokens {
                    return None;
                }

                // Compute code-level ignore ranges from regex matches against source
                // text; any token overlapping a match range is skipped during detection.
                let code_ranges = if code_ignore_regexes.is_empty() {
                    Vec::new()
                } else {
                    code_ignore_ranges(content, &code_ignore_regexes)
                };
                let opts = TokenizeOptions {
                    mode,
                    ignore_case,
                    ignore_ranges: code_ranges,
                    code_ignore_regexes: code_ignore_regexes.clone(),
                };

                if is_multi_format(&file.format) {
                    // Multi-format path: produce one PreparedSource per sub-format.
                    let maps = tokenize_to_detection_maps(&file.format, content, &opts);
                    let mut source_files = vec![SourceFile {
                        id: id.clone(),
                        format: file.format.clone(),
                        tokens,
                    }];
                    let mut prepared = Vec::new();
                    for sub in maps {
                        if sub.tokens.len() < min_tokens {
                            continue;
                        }
                        let map_id = format!("{}:{}", &id, &sub.format);
                        // For sub-formats, create a synthetic SourceFile with detection
                        // tokens converted to display tokens so statistics per-format
                        // counts are correct.
                        if sub.format != file.format {
                            let synth_tokens: Vec<cpd_core::models::Token> = sub
                                .tokens
                                .iter()
                                .map(|dt| cpd_core::models::Token {
                                    kind: cpd_core::models::TokenKind::Other,
                                    value: String::new(),
                                    start: dt.start.clone(),
                                    end: dt.end.clone(),
                                })
                                .collect();
                            source_files.push(SourceFile {
                                id: map_id.clone(),
                                format: sub.format.clone(),
                                tokens: synth_tokens,
                            });
                        }
                        prepared.push(PreparedSource::from_detection_tokens(
                            map_id,
                            sub.format,
                            &sub.tokens,
                        ));
                    }
                    if prepared.is_empty() {
                        return None;
                    }
                    Some((source_files, prepared))
                } else {
                    // Single-format path.
                    let source_file = SourceFile {
                        id: id.clone(),
                        format: file.format.clone(),
                        tokens,
                    };
                    let det_tokens = tokenize_to_detection(&file.format, content, &opts);
                    if det_tokens.len() < min_tokens {
                        return None;
                    }
                    let prepared =
                        PreparedSource::from_detection_tokens(id, file.format, &det_tokens);
                    Some((vec![source_file], vec![prepared]))
                }
            })
            .collect()
    });

    // Flatten the per-file results, preserving discovery order.
    let mut source_files: Vec<SourceFile> = Vec::new();
    let mut prepared_sources: Vec<PreparedSource> = Vec::new();
    for (files, prepared) in results {
        source_files.extend(files);
        prepared_sources.extend(prepared);
    }

    // 3. Group prepared sources by format (deterministic order): sort by
    //    (format, id), then bucket into a BTreeMap so the groups come out
    //    ordered by format name without a second sort.
    prepared_sources
        .sort_unstable_by(|a, b| a.format.cmp(&b.format).then_with(|| a.id.cmp(&b.id)));
    let mut by_format: std::collections::BTreeMap<String, Vec<PreparedSource>> =
        std::collections::BTreeMap::new();
    for ps in prepared_sources {
        by_format.entry(ps.format.clone()).or_default().push(ps);
    }
    let format_groups: Vec<Vec<PreparedSource>> = by_format.into_values().collect();

    // 4. Detect clones — skip_local is forwarded to the detector.
    let clones =
        pool.install(|| detect_prepared(format_groups, min_tokens, skip_local, min_lines));

    // 5. Compute statistics.
    let statistics = statistics::compute(&source_files, &clones);

    Ok(RunResult {
        clones,
        statistics,
        sources: source_files,
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// A default config has no paths, so the run must yield nothing.
    #[test]
    fn empty_paths_returns_empty_result() {
        let outcome = run(&RunConfig::default()).unwrap();
        assert!(outcome.clones.is_empty());
        assert_eq!(outcome.statistics.total.sources, 0);
    }

    /// A path that does not exist is not an error; it just finds no clones.
    #[test]
    fn nonexistent_path_returns_empty() {
        let missing = PathBuf::from("/tmp/cpd-nonexistent-xyz");
        let outcome = run(&RunConfig {
            paths: vec![missing],
            ..Default::default()
        })
        .unwrap();
        assert!(outcome.clones.is_empty());
    }

    /// Limiting the pool to one worker must not change how many sources are kept.
    #[test]
    fn workers_1_produces_same_result_as_default() {
        let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/walker");
        if !fixtures.exists() {
            // Fixtures are optional; nothing to compare without them.
            return;
        }

        // Same configuration for both runs, differing only in `workers`.
        let run_with = |workers: Option<usize>| {
            run(&RunConfig {
                paths: vec![fixtures.clone()],
                min_tokens: 3,
                workers,
                ..Default::default()
            })
            .unwrap()
        };

        let baseline = run_with(None);
        let single_worker = run_with(Some(1));
        assert_eq!(
            baseline.sources.len(),
            single_worker.sources.len(),
            "--workers 1 must produce same source count as default"
        );
    }
}