1use std::collections::HashSet;
2use std::ffi::OsString;
3use std::path::PathBuf;
4
5use anyhow::{Context, Result, bail};
6use clap::Parser;
7use regex::Regex;
8use time::OffsetDateTime;
9use time::format_description::well_known::Rfc3339;
10
11use crate::files::collect_cwd_gitignore_patterns;
12
13mod config;
14mod parsing;
15#[cfg(test)]
16mod tests;
17
18#[cfg(test)]
19use config::{FileConfig, resolve_config_ignore};
20use config::{apply_config, read_config, read_package_json_config};
21#[cfg(test)]
22use parsing::parse_format_mappings;
23use parsing::{
24 compile_patterns, parse_format_mappings_like_upstream, parse_js_number, parse_js_usize,
25 parse_size, split_csv,
26};
27
// Sentinel strings substituted by clap (via `default_missing_value`) when an
// option that normally takes a value is passed bare. They let the option
// resolution code tell "flag given without a value" apart from any real value.

// Bare `--exitCode`.
const BARE_EXIT_CODE_VALUE: &str = "__jscpd_rs_bare_exit_code_true__";
// Bare `--config`.
const BARE_CONFIG_VALUE: &str = "__jscpd_rs_bare_config_true__";
// Bare string-valued options such as `--ignore`, `--reporters` or `--format`.
const BARE_STRING_VALUE: &str = "__jscpd_rs_bare_string_true__";
31
32#[derive(Debug, Parser)]
33#[command(
34 name = "jscpd",
35 version,
36 about = "detector of copy/paste in files",
37 override_usage = "jscpd [options] <path ...>",
38 disable_version_flag = true,
39 args_override_self = true
40)]
41pub struct Cli {
42 #[arg(short = 'V', long = "version", help = "output the version number")]
43 pub version: bool,
44
45 #[arg(value_name = "path", hide = true)]
46 pub paths: Vec<PathBuf>,
47
48 #[arg(
49 short = 'l',
50 long = "min-lines",
51 value_name = "number",
52 num_args = 0..=1,
53 default_missing_value = "0",
54 value_parser = parse_js_usize,
55 help = "min size of duplication in code lines (Default is 5)"
56 )]
57 pub min_lines: Option<usize>,
58
59 #[arg(
60 short = 'k',
61 long = "min-tokens",
62 value_name = "number",
63 num_args = 0..=1,
64 default_missing_value = "50",
65 value_parser = parse_js_usize,
66 help = "min size of duplication in code tokens (Default is 50)"
67 )]
68 pub min_tokens: Option<usize>,
69
70 #[arg(
71 short = 'x',
72 long = "max-lines",
73 value_name = "number",
74 num_args = 0..=1,
75 default_missing_value = "18446744073709551615",
76 value_parser = parse_js_usize,
77 help = "max size of source in lines (Default is 1000)"
78 )]
79 pub max_lines: Option<usize>,
80
81 #[arg(
82 short = 'z',
83 long = "max-size",
84 value_name = "string",
85 num_args = 0..=1,
86 default_missing_value = "true",
87 help = "max size of source in bytes, examples: 1kb, 1mb, 120kb (Default is 100kb)"
88 )]
89 pub max_size: Option<String>,
90
91 #[arg(
92 short = 't',
93 long = "threshold",
94 value_name = "number",
95 num_args = 0..=1,
96 default_missing_value = "1",
97 value_parser = parse_js_number,
98 help = "threshold for duplication, in case duplications >= threshold jscpd will exit with error"
99 )]
100 pub threshold: Option<f64>,
101
102 #[arg(
103 short = 'c',
104 long = "config",
105 value_name = "string",
106 num_args = 0..=1,
107 default_missing_value = BARE_CONFIG_VALUE,
108 help = "path to config file (Default is .jscpd.json in <path>)"
109 )]
110 pub config: Option<PathBuf>,
111
112 #[arg(
113 short = 'i',
114 long = "ignore",
115 value_name = "string",
116 num_args = 0..=1,
117 default_missing_value = BARE_STRING_VALUE,
118 help = "glob pattern for files what should be excluded from duplication detection"
119 )]
120 pub ignore: Option<String>,
121
122 #[arg(
123 short = 'r',
124 long = "reporters",
125 value_name = "string",
126 num_args = 0..=1,
127 default_missing_value = BARE_STRING_VALUE,
128 help = "reporters or list of reporters separated with comma to use (Default is time,console)"
129 )]
130 pub reporters: Option<String>,
131
132 #[arg(
133 short = 'o',
134 long = "output",
135 value_name = "string",
136 num_args = 0..=1,
137 default_missing_value = BARE_STRING_VALUE,
138 help = "reporters to use (Default is ./report/)"
139 )]
140 pub output: Option<String>,
141
142 #[arg(
143 short = 'm',
144 long = "mode",
145 value_name = "string",
146 num_args = 0..=1,
147 default_missing_value = BARE_STRING_VALUE,
148 help = "mode of quality of search, can be \"strict\", \"mild\" and \"weak\""
149 )]
150 pub mode: Option<String>,
151
152 #[arg(
153 short = 'f',
154 long = "format",
155 value_name = "string",
156 num_args = 0..=1,
157 default_missing_value = BARE_STRING_VALUE,
158 help = "format or formats separated by comma (Example php,javascript,python)"
159 )]
160 pub format: Option<String>,
161
162 #[arg(
163 short = 'p',
164 long = "pattern",
165 value_name = "string",
166 num_args = 0..=1,
167 default_missing_value = "true",
168 help = "glob pattern to file search (Example **/*.txt)"
169 )]
170 pub pattern: Option<String>,
171
172 #[arg(
173 short = 'b',
174 long = "blame",
175 help = "blame authors of duplications (get information about authors from git)"
176 )]
177 pub blame: bool,
178
179 #[arg(
180 short = 's',
181 long = "silent",
182 help = "do not write detection progress and result to a console"
183 )]
184 pub silent: bool,
185
186 #[arg(
187 long = "store",
188 value_name = "string",
189 num_args = 0..=1,
190 default_missing_value = "true",
191 help = "use for define custom store (e.g. --store leveldb used for big codebase)"
192 )]
193 pub store: Option<String>,
194
195 #[arg(
196 long = "store-path",
197 value_name = "string",
198 num_args = 0..=1,
199 default_missing_value = "true",
200 help = "directory to use for store cache (e.g. --store-path /tmp/jscpd-cache, useful when running multiple instances in parallel)"
201 )]
202 pub store_path: Option<PathBuf>,
203
204 #[arg(short = 'a', long = "absolute", help = "use absolute path in reports")]
205 pub absolute: bool,
206
207 #[arg(
208 short = 'n',
209 long = "noSymlinks",
210 help = "dont use symlinks for detection in files"
211 )]
212 pub no_symlinks: bool,
213
214 #[arg(
215 long = "ignoreCase",
216 help = "ignore case of symbols in code (experimental)"
217 )]
218 pub ignore_case: bool,
219
220 #[arg(
221 short = 'g',
222 long = "gitignore",
223 help = "respect .gitignore files (default: enabled, use --no-gitignore to disable)"
224 )]
225 pub gitignore: bool,
226
227 #[arg(long = "no-gitignore", help = "do not respect .gitignore files")]
228 pub no_gitignore: bool,
229
230 #[arg(
231 short = 'd',
232 long = "debug",
233 help = "show debug information, not run detection process(options list and selected files)"
234 )]
235 pub debug: bool,
236
237 #[arg(
238 short = 'v',
239 long = "verbose",
240 help = "show full information during detection process"
241 )]
242 pub verbose: bool,
243
244 #[arg(long = "list", help = "show list of total supported formats")]
245 pub list: bool,
246
247 #[arg(
248 long = "skipLocal",
249 help = "skip duplicates in local folders, just detect cross folders duplications"
250 )]
251 pub skip_local: bool,
252
253 #[arg(
254 long = "exitCode",
255 value_name = "number",
256 num_args = 0..=1,
257 default_missing_value = "__jscpd_rs_bare_exit_code_true__",
258 help = "exit code to use when code duplications are detected"
259 )]
260 pub exit_code: Option<String>,
261
262 #[arg(
263 long = "noTips",
264 help = "do not print tips and promotional messages after detection"
265 )]
266 pub no_tips: bool,
267
268 #[arg(
269 long = "skipComments",
270 help = "ignore comments during detection (alias for --mode weak)"
271 )]
272 pub skip_comments: bool,
273
274 #[arg(
275 long = "ignore-pattern",
276 value_name = "string",
277 num_args = 0..=1,
278 default_missing_value = BARE_STRING_VALUE,
279 help = "Ignore code blocks matching the regexp patterns"
280 )]
281 pub ignore_pattern: Option<String>,
282
283 #[arg(
284 long = "formats-exts",
285 value_name = "string",
286 num_args = 0..=1,
287 default_missing_value = BARE_STRING_VALUE,
288 help = "list of formats with file extensions (javascript:es,es6;dart:dt)"
289 )]
290 pub formats_exts: Option<String>,
291
292 #[arg(
293 long = "formats-names",
294 value_name = "string",
295 num_args = 0..=1,
296 default_missing_value = BARE_STRING_VALUE,
297 help = "list of formats with specific filenames (makefile:Makefile,GNUmakefile;docker:Dockerfile)"
298 )]
299 pub formats_names: Option<String>,
300}
301
/// Detection mode selected through `--mode` (or the configuration).
///
/// The accepted spellings are `"strict"`, `"mild"` and `"weak"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
    /// Selected by the value `"strict"`.
    Strict,
    /// Selected by the value `"mild"`; this is the default mode.
    Mild,
    /// Selected by the value `"weak"`, and by `--skipComments` when no mode is given.
    Weak,
}
308
/// Loosely typed exit-code setting, kept in the shape it was supplied in.
///
/// It is turned into a concrete process exit code later (see
/// `resolve_node_exit_code`).
#[derive(Debug, Clone, PartialEq)]
pub enum ExitCode {
    /// A numeric value; the default option value is `Number(0.0)`.
    Number(f64),
    /// The raw text passed to `--exitCode`.
    String(String),
    /// A boolean; `--exitCode` without a value becomes `Boolean(true)`.
    Boolean(bool),
}
315
316impl ExitCode {
317 fn from_cli(value: String) -> Self {
318 if value == BARE_EXIT_CODE_VALUE {
319 Self::Boolean(true)
320 } else {
321 Self::String(value)
322 }
323 }
324}
325
326#[derive(Debug, Clone)]
333pub struct Options {
334 pub execution_id: Option<String>,
335 pub config: Option<PathBuf>,
336 pub paths: Vec<PathBuf>,
337 pub pattern: String,
338 pub ignore: Vec<String>,
339 pub reporters: Vec<String>,
340 pub listeners: Vec<String>,
341 pub reporters_options: serde_json::Map<String, serde_json::Value>,
342 pub output: PathBuf,
343 pub output_is_bare: bool,
344 pub formats: Option<HashSet<String>>,
345 pub format_order: Option<Vec<String>>,
346 pub formats_exts: FormatMappings,
347 pub formats_names: FormatMappings,
348 pub ignore_pattern: Vec<Regex>,
349 pub min_lines: usize,
350 pub min_tokens: usize,
351 pub max_lines: usize,
352 pub max_size_bytes: u64,
353 pub threshold: Option<f64>,
354 pub mode: Mode,
355 pub store: Option<String>,
356 pub store_path: Option<PathBuf>,
357 pub blame: bool,
358 pub cache: bool,
359 pub silent: bool,
360 pub absolute: bool,
361 pub no_symlinks: bool,
362 pub ignore_case: bool,
363 pub gitignore: bool,
364 pub debug: bool,
365 pub verbose: bool,
366 pub skip_local: bool,
367 pub exit_code: ExitCode,
368 pub no_tips: bool,
369 pub tokens_to_skip: Vec<String>,
370}
371
/// Ordered list of `(format, values)` associations.
///
/// Insertion order is preserved, and lookups return the first format whose
/// value list contains the requested value.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct FormatMappings(Vec<(String, Vec<String>)>);

impl FormatMappings {
    /// Builds the mappings from `(format, values)` pairs, keeping their order.
    pub fn from_pairs<I, S, V, T>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (S, V)>,
        S: Into<String>,
        V: IntoIterator<Item = T>,
        T: Into<String>,
    {
        let mut entries = Vec::new();
        for (format, values) in pairs {
            let owned_values: Vec<String> = values.into_iter().map(Into::into).collect();
            entries.push((format.into(), owned_values));
        }
        Self(entries)
    }

    /// Returns `true` when no format is registered.
    pub fn is_empty(&self) -> bool {
        let Self(entries) = self;
        entries.is_empty()
    }

    /// Iterates over the formats and their values in insertion order.
    pub fn iter(&self) -> impl Iterator<Item = (&String, &Vec<String>)> {
        self.0.iter().map(|entry| (&entry.0, &entry.1))
    }

    /// Returns the first format whose value list contains exactly `value`.
    pub fn find_format_for_value(&self, value: &str) -> Option<&str> {
        self.0
            .iter()
            .find(|(_, values)| values.iter().any(|candidate| candidate.as_str() == value))
            .map(|(format, _)| format.as_str())
    }
}
418
419impl Default for Options {
420 fn default() -> Self {
421 Self {
422 execution_id: Some(default_execution_id()),
423 config: None,
424 paths: vec![std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."))],
425 pattern: "**/*".to_string(),
426 ignore: Vec::new(),
427 reporters: vec!["console".to_string()],
428 listeners: Vec::new(),
429 reporters_options: serde_json::Map::new(),
430 output: PathBuf::from("./report"),
431 output_is_bare: false,
432 formats: None,
433 format_order: None,
434 formats_exts: FormatMappings::default(),
435 formats_names: FormatMappings::default(),
436 ignore_pattern: Vec::new(),
437 min_lines: 5,
438 min_tokens: 50,
439 max_lines: 1000,
440 max_size_bytes: 100 * 1024,
441 threshold: None,
442 mode: Mode::Mild,
443 store: None,
444 store_path: None,
445 blame: false,
446 cache: true,
447 silent: false,
448 absolute: false,
449 no_symlinks: false,
450 ignore_case: false,
451 gitignore: true,
452 debug: false,
453 verbose: false,
454 skip_local: false,
455 exit_code: ExitCode::Number(0.0),
456 no_tips: std::env::var_os("CI").is_some(),
457 tokens_to_skip: Vec::new(),
458 }
459 }
460}
461
462fn default_execution_id() -> String {
463 OffsetDateTime::now_utc()
464 .format(&Rfc3339)
465 .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string())
466}
467
468impl Options {
469 pub fn from_args<I, T>(args: I) -> Result<Self>
470 where
471 I: IntoIterator<Item = T>,
472 T: Into<OsString> + Clone,
473 {
474 let cli = Cli::try_parse_from(args)?;
475 Self::from_cli(cli)
476 }
477
478 pub fn from_cli(cli: Cli) -> Result<Self> {
479 let mut options = Self::default();
480
481 if matches!(cli.config.as_deref(), Some(path) if path == std::path::Path::new(BARE_CONFIG_VALUE))
482 {
483 bail!(
484 "TypeError [ERR_INVALID_ARG_TYPE]: The \"paths[0]\" argument must be of type string. Received type boolean (true)"
485 );
486 }
487
488 if let Some((config, config_dir, config_path)) = read_package_json_config()? {
489 options.config = Some(config_path);
490 apply_config(&mut options, config, &config_dir)?;
491 }
492 if let Some((config, config_dir, config_path)) = read_config(cli.config.as_deref())? {
493 options.config = Some(config_path);
494 apply_config(&mut options, config, &config_dir)?;
495 }
496
497 if !cli.paths.is_empty() {
498 options.paths = cli.paths;
499 }
500 if let Some(pattern) = cli.pattern {
501 options.pattern = pattern;
502 }
503 if let Some(ignore) = cli.ignore {
504 if is_bare_string(&ignore) {
505 bail!("TypeError: cli.ignore.split is not a function");
506 }
507 options.ignore = split_csv(&ignore);
508 }
509 if let Some(reporters) = cli.reporters {
510 if is_bare_string(&reporters) {
511 bail!("TypeError: cli.reporters.split is not a function");
512 }
513 options.reporters = split_csv(&reporters);
514 }
515 if let Some(output) = cli.output {
516 if is_bare_string(&output) {
517 options.output = PathBuf::from("true");
518 options.output_is_bare = true;
519 } else {
520 options.output = PathBuf::from(output);
521 options.output_is_bare = false;
522 }
523 }
524 if let Some(format) = cli.format {
525 if is_bare_string(&format) {
526 bail!("TypeError: cli.format.split is not a function");
527 }
528 let formats = split_csv(&format);
529 options.formats = Some(formats.iter().cloned().collect());
530 options.format_order = Some(formats);
531 }
532 if let Some(formats_exts) = cli.formats_exts {
533 if is_bare_string(&formats_exts) {
534 bail!("TypeError: extensions.split is not a function");
535 }
536 options.formats_exts = parse_format_mappings_like_upstream(&formats_exts)?;
537 }
538 if let Some(formats_names) = cli.formats_names {
539 if is_bare_string(&formats_names) {
540 bail!("TypeError: extensions.split is not a function");
541 }
542 options.formats_names = parse_format_mappings_like_upstream(&formats_names)?;
543 }
544 if let Some(ignore_pattern) = cli.ignore_pattern {
545 if is_bare_string(&ignore_pattern) {
546 bail!("TypeError: cli.ignorePattern.split is not a function");
547 }
548 options.ignore_pattern = compile_patterns(split_csv(&ignore_pattern))
549 .context("invalid --ignore-pattern value")?;
550 }
551 if let Some(min_lines) = cli.min_lines {
552 options.min_lines = min_lines;
553 }
554 if let Some(min_tokens) = cli.min_tokens {
555 options.min_tokens = min_tokens;
556 }
557 if let Some(max_lines) = cli.max_lines {
558 options.max_lines = max_lines;
559 }
560 if let Some(max_size) = cli.max_size {
561 options.max_size_bytes = parse_size(&max_size)
562 .with_context(|| format!("invalid --max-size value `{max_size}`"))?;
563 }
564 if let Some(threshold) = cli.threshold {
565 options.threshold = Some(threshold);
566 }
567 if let Some(mode) = cli.mode.as_deref() {
568 if is_bare_string(mode) {
569 bail!("TypeError: mode is not a function");
570 }
571 options.mode = parse_mode(mode)?;
572 }
573 if cli.skip_comments && cli.mode.is_none() {
574 options.mode = Mode::Weak;
575 }
576 if let Some(store) = cli.store {
577 options.store = Some(store);
578 }
579 if let Some(store_path) = cli.store_path {
580 options.store_path = Some(store_path);
581 }
582 if cli.blame {
583 options.blame = true;
584 }
585 if cli.silent {
586 options.silent = true;
587 }
588 if cli.absolute {
589 options.absolute = true;
590 }
591 if cli.no_symlinks {
592 options.no_symlinks = true;
593 }
594 if cli.ignore_case {
595 options.ignore_case = true;
596 }
597 if cli.no_gitignore {
598 options.gitignore = false;
599 } else if cli.gitignore {
600 options.gitignore = true;
601 }
602 if cli.debug {
603 options.debug = true;
604 }
605 if cli.verbose {
606 options.verbose = true;
607 }
608 if cli.skip_local {
609 options.skip_local = true;
610 }
611 if let Some(exit_code) = cli.exit_code {
612 options.exit_code = ExitCode::from_cli(exit_code);
613 }
614 if cli.no_tips {
615 options.no_tips = true;
616 }
617
618 apply_cwd_gitignore_patterns(&mut options)?;
619 normalize_reporters(&mut options);
620
621 Ok(options)
622 }
623}
624
/// Tells whether `value` is the sentinel clap inserts for a string option
/// that was passed without a value.
fn is_bare_string(value: &str) -> bool {
    matches!(value, BARE_STRING_VALUE)
}
628
629pub fn resolve_node_exit_code(exit_code: &ExitCode) -> std::result::Result<i32, String> {
630 parsing::node_exit_code(exit_code).map_err(|error| error.message())
631}
632
633pub fn store_warning(options: &Options) -> Option<String> {
634 options
635 .store
636 .as_ref()
637 .map(|store| format!("store name {store} not installed."))
638}
639
640pub(super) fn parse_mode(value: &str) -> Result<Mode> {
641 match value {
642 "strict" => Ok(Mode::Strict),
643 "mild" => Ok(Mode::Mild),
644 "weak" => Ok(Mode::Weak),
645 _ => bail!("Mode {value} does not supported yet."),
646 }
647}
648
649fn normalize_reporters(options: &mut Options) {
650 if options.silent {
651 options
652 .reporters
653 .retain(|reporter| !reporter.contains("console"));
654 options.reporters.push("silent".to_string());
655 }
656 if options.threshold.is_some() {
657 options.reporters.push("threshold".to_string());
658 }
659}
660
661fn apply_cwd_gitignore_patterns(options: &mut Options) -> Result<()> {
662 let cwd = std::env::current_dir().context("failed to resolve current directory")?;
663 apply_gitignore_patterns_from(options, &cwd);
664 Ok(())
665}
666
667fn apply_gitignore_patterns_from(options: &mut Options, cwd: &std::path::Path) {
668 if options.gitignore {
669 options.ignore.extend(collect_cwd_gitignore_patterns(cwd));
670 }
671}