Skip to main content

jscpd_rs/
cli.rs

1use std::collections::HashSet;
2use std::ffi::OsString;
3use std::path::PathBuf;
4
5use anyhow::{Context, Result, bail};
6use clap::Parser;
7use regex::Regex;
8use time::OffsetDateTime;
9use time::format_description::well_known::Rfc3339;
10
11use crate::files::collect_cwd_gitignore_patterns;
12
13mod config;
14mod parsing;
15#[cfg(test)]
16mod tests;
17
18#[cfg(test)]
19use config::{FileConfig, resolve_config_ignore};
20use config::{apply_config, read_config, read_package_json_config};
21#[cfg(test)]
22use parsing::parse_format_mappings;
23use parsing::{
24    compile_patterns, parse_format_mappings_like_upstream, parse_js_number, parse_js_usize,
25    parse_size, split_csv,
26};
27
/// Sentinel that clap substitutes when `--exitCode` is passed without a value.
///
/// `ExitCode::from_cli` turns it into `ExitCode::Boolean(true)`.
const BARE_EXIT_CODE_VALUE: &str = "__jscpd_rs_bare_exit_code_true__";

/// Sentinel that clap substitutes when `--config` is passed without a value.
///
/// `Options::from_cli` rejects it with an error before reading any config.
const BARE_CONFIG_VALUE: &str = "__jscpd_rs_bare_config_true__";

/// Sentinel that clap substitutes when a string-valued option such as
/// `--ignore`, `--reporters`, or `--format` is passed without a value.
const BARE_STRING_VALUE: &str = "__jscpd_rs_bare_string_true__";
31
32#[derive(Debug, Parser)]
33#[command(
34    name = "jscpd",
35    version,
36    about = "detector of copy/paste in files",
37    override_usage = "jscpd [options] <path ...>",
38    disable_version_flag = true,
39    args_override_self = true
40)]
41pub struct Cli {
42    #[arg(short = 'V', long = "version", help = "output the version number")]
43    pub version: bool,
44
45    #[arg(value_name = "path", hide = true)]
46    pub paths: Vec<PathBuf>,
47
48    #[arg(
49        short = 'l',
50        long = "min-lines",
51        value_name = "number",
52        num_args = 0..=1,
53        default_missing_value = "0",
54        value_parser = parse_js_usize,
55        help = "min size of duplication in code lines (Default is 5)"
56    )]
57    pub min_lines: Option<usize>,
58
59    #[arg(
60        short = 'k',
61        long = "min-tokens",
62        value_name = "number",
63        num_args = 0..=1,
64        default_missing_value = "50",
65        value_parser = parse_js_usize,
66        help = "min size of duplication in code tokens (Default is 50)"
67    )]
68    pub min_tokens: Option<usize>,
69
70    #[arg(
71        short = 'x',
72        long = "max-lines",
73        value_name = "number",
74        num_args = 0..=1,
75        default_missing_value = "18446744073709551615",
76        value_parser = parse_js_usize,
77        help = "max size of source in lines (Default is 1000)"
78    )]
79    pub max_lines: Option<usize>,
80
81    #[arg(
82        short = 'z',
83        long = "max-size",
84        value_name = "string",
85        num_args = 0..=1,
86        default_missing_value = "true",
87        help = "max size of source in bytes, examples: 1kb, 1mb, 120kb (Default is 100kb)"
88    )]
89    pub max_size: Option<String>,
90
91    #[arg(
92        short = 't',
93        long = "threshold",
94        value_name = "number",
95        num_args = 0..=1,
96        default_missing_value = "1",
97        value_parser = parse_js_number,
98        help = "threshold for duplication, in case duplications >= threshold jscpd will exit with error"
99    )]
100    pub threshold: Option<f64>,
101
102    #[arg(
103        short = 'c',
104        long = "config",
105        value_name = "string",
106        num_args = 0..=1,
107        default_missing_value = BARE_CONFIG_VALUE,
108        help = "path to config file (Default is .jscpd.json in <path>)"
109    )]
110    pub config: Option<PathBuf>,
111
112    #[arg(
113        short = 'i',
114        long = "ignore",
115        value_name = "string",
116        num_args = 0..=1,
117        default_missing_value = BARE_STRING_VALUE,
118        help = "glob pattern for files what should be excluded from duplication detection"
119    )]
120    pub ignore: Option<String>,
121
122    #[arg(
123        short = 'r',
124        long = "reporters",
125        value_name = "string",
126        num_args = 0..=1,
127        default_missing_value = BARE_STRING_VALUE,
128        help = "reporters or list of reporters separated with comma to use (Default is time,console)"
129    )]
130    pub reporters: Option<String>,
131
132    #[arg(
133        short = 'o',
134        long = "output",
135        value_name = "string",
136        num_args = 0..=1,
137        default_missing_value = BARE_STRING_VALUE,
138        help = "reporters to use (Default is ./report/)"
139    )]
140    pub output: Option<String>,
141
142    #[arg(
143        short = 'm',
144        long = "mode",
145        value_name = "string",
146        num_args = 0..=1,
147        default_missing_value = BARE_STRING_VALUE,
148        help = "mode of quality of search, can be \"strict\", \"mild\" and \"weak\""
149    )]
150    pub mode: Option<String>,
151
152    #[arg(
153        short = 'f',
154        long = "format",
155        value_name = "string",
156        num_args = 0..=1,
157        default_missing_value = BARE_STRING_VALUE,
158        help = "format or formats separated by comma (Example php,javascript,python)"
159    )]
160    pub format: Option<String>,
161
162    #[arg(
163        short = 'p',
164        long = "pattern",
165        value_name = "string",
166        num_args = 0..=1,
167        default_missing_value = "true",
168        help = "glob pattern to file search (Example **/*.txt)"
169    )]
170    pub pattern: Option<String>,
171
172    #[arg(
173        short = 'b',
174        long = "blame",
175        help = "blame authors of duplications (get information about authors from git)"
176    )]
177    pub blame: bool,
178
179    #[arg(
180        short = 's',
181        long = "silent",
182        help = "do not write detection progress and result to a console"
183    )]
184    pub silent: bool,
185
186    #[arg(
187        long = "store",
188        value_name = "string",
189        num_args = 0..=1,
190        default_missing_value = "true",
191        help = "use for define custom store (e.g. --store leveldb used for big codebase)"
192    )]
193    pub store: Option<String>,
194
195    #[arg(
196        long = "store-path",
197        value_name = "string",
198        num_args = 0..=1,
199        default_missing_value = "true",
200        help = "directory to use for store cache (e.g. --store-path /tmp/jscpd-cache, useful when running multiple instances in parallel)"
201    )]
202    pub store_path: Option<PathBuf>,
203
204    #[arg(short = 'a', long = "absolute", help = "use absolute path in reports")]
205    pub absolute: bool,
206
207    #[arg(
208        short = 'n',
209        long = "noSymlinks",
210        help = "dont use symlinks for detection in files"
211    )]
212    pub no_symlinks: bool,
213
214    #[arg(
215        long = "ignoreCase",
216        help = "ignore case of symbols in code (experimental)"
217    )]
218    pub ignore_case: bool,
219
220    #[arg(
221        short = 'g',
222        long = "gitignore",
223        help = "respect .gitignore files (default: enabled, use --no-gitignore to disable)"
224    )]
225    pub gitignore: bool,
226
227    #[arg(long = "no-gitignore", help = "do not respect .gitignore files")]
228    pub no_gitignore: bool,
229
230    #[arg(
231        short = 'd',
232        long = "debug",
233        help = "show debug information, not run detection process(options list and selected files)"
234    )]
235    pub debug: bool,
236
237    #[arg(
238        short = 'v',
239        long = "verbose",
240        help = "show full information during detection process"
241    )]
242    pub verbose: bool,
243
244    #[arg(long = "list", help = "show list of total supported formats")]
245    pub list: bool,
246
247    #[arg(
248        long = "skipLocal",
249        help = "skip duplicates in local folders, just detect cross folders duplications"
250    )]
251    pub skip_local: bool,
252
253    #[arg(
254        long = "exitCode",
255        value_name = "number",
256        num_args = 0..=1,
257        default_missing_value = "__jscpd_rs_bare_exit_code_true__",
258        help = "exit code to use when code duplications are detected"
259    )]
260    pub exit_code: Option<String>,
261
262    #[arg(
263        long = "noTips",
264        help = "do not print tips and promotional messages after detection"
265    )]
266    pub no_tips: bool,
267
268    #[arg(
269        long = "skipComments",
270        help = "ignore comments during detection (alias for --mode weak)"
271    )]
272    pub skip_comments: bool,
273
274    #[arg(
275        long = "ignore-pattern",
276        value_name = "string",
277        num_args = 0..=1,
278        default_missing_value = BARE_STRING_VALUE,
279        help = "Ignore code blocks matching the regexp patterns"
280    )]
281    pub ignore_pattern: Option<String>,
282
283    #[arg(
284        long = "formats-exts",
285        value_name = "string",
286        num_args = 0..=1,
287        default_missing_value = BARE_STRING_VALUE,
288        help = "list of formats with file extensions (javascript:es,es6;dart:dt)"
289    )]
290    pub formats_exts: Option<String>,
291
292    #[arg(
293        long = "formats-names",
294        value_name = "string",
295        num_args = 0..=1,
296        default_missing_value = BARE_STRING_VALUE,
297        help = "list of formats with specific filenames (makefile:Makefile,GNUmakefile;docker:Dockerfile)"
298    )]
299    pub formats_names: Option<String>,
300}
301
/// Detection mode selected with `--mode` (parsed by `parse_mode`).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Parsed from the value `strict`.
    Strict,
    /// Parsed from the value `mild`; the mode used by `Options::default`.
    Mild,
    /// Parsed from the value `weak`; also chosen by `--skipComments` when no
    /// `--mode` is given.
    Weak,
}
308
309#[derive(Clone, Debug, PartialEq)]
310pub enum ExitCode {
311    Number(f64),
312    String(String),
313    Boolean(bool),
314}
315
316impl ExitCode {
317    fn from_cli(value: String) -> Self {
318        if value == BARE_EXIT_CODE_VALUE {
319            Self::Boolean(true)
320        } else {
321            Self::String(value)
322        }
323    }
324}
325
326/// Normalized detector options shared by the CLI, server, and Rust API.
327///
328/// Values are usually created with `get_default_options`,
329/// `get_options_from_args`, or the `jscpd` binary. Fields are public so native
330/// integrations can construct focused option sets without going through CLI
331/// parsing.
332#[derive(Debug, Clone)]
333pub struct Options {
334    pub execution_id: Option<String>,
335    pub config: Option<PathBuf>,
336    pub paths: Vec<PathBuf>,
337    pub pattern: String,
338    pub ignore: Vec<String>,
339    pub reporters: Vec<String>,
340    pub listeners: Vec<String>,
341    pub reporters_options: serde_json::Map<String, serde_json::Value>,
342    pub output: PathBuf,
343    pub output_is_bare: bool,
344    pub formats: Option<HashSet<String>>,
345    pub format_order: Option<Vec<String>>,
346    pub formats_exts: FormatMappings,
347    pub formats_names: FormatMappings,
348    pub ignore_pattern: Vec<Regex>,
349    pub min_lines: usize,
350    pub min_tokens: usize,
351    pub max_lines: usize,
352    pub max_size_bytes: u64,
353    pub threshold: Option<f64>,
354    pub mode: Mode,
355    pub store: Option<String>,
356    pub store_path: Option<PathBuf>,
357    pub blame: bool,
358    pub cache: bool,
359    pub silent: bool,
360    pub absolute: bool,
361    pub no_symlinks: bool,
362    pub ignore_case: bool,
363    pub gitignore: bool,
364    pub debug: bool,
365    pub verbose: bool,
366    pub skip_local: bool,
367    pub exit_code: ExitCode,
368    pub no_tips: bool,
369    pub tokens_to_skip: Vec<String>,
370}
371
/// Additional format mappings from extensions or exact filenames to formats.
///
/// This is the Rust API counterpart of the CLI `--formats-exts` and
/// `--formats-names` options. Pairs are stored in the order they were given.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct FormatMappings(Vec<(String, Vec<String>)>);

impl FormatMappings {
    /// Build mappings from `(format, values)` pairs, preserving their order.
    pub fn from_pairs<I, S, V, T>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (S, V)>,
        S: Into<String>,
        V: IntoIterator<Item = T>,
        T: Into<String>,
    {
        let mut mappings = Vec::new();
        for (format, values) in pairs {
            let format: String = format.into();
            let values: Vec<String> = values.into_iter().map(Into::into).collect();
            mappings.push((format, values));
        }
        Self(mappings)
    }

    /// Return true when no mappings are defined.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Iterate through `(format, values)` pairs in insertion order.
    pub fn iter(&self) -> impl Iterator<Item = (&String, &Vec<String>)> {
        self.0.iter().map(|pair| (&pair.0, &pair.1))
    }

    /// Return the format mapped to an extension or exact filename.
    ///
    /// The comparison is an exact string match; when several formats list
    /// `value`, the first one in insertion order is returned.
    pub fn find_format_for_value(&self, value: &str) -> Option<&str> {
        self.0
            .iter()
            .find(|(_, values)| values.iter().any(|candidate| candidate == value))
            .map(|(format, _)| format.as_str())
    }
}
418
419impl Default for Options {
420    fn default() -> Self {
421        Self {
422            execution_id: Some(default_execution_id()),
423            config: None,
424            paths: vec![std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."))],
425            pattern: "**/*".to_string(),
426            ignore: Vec::new(),
427            reporters: vec!["console".to_string()],
428            listeners: Vec::new(),
429            reporters_options: serde_json::Map::new(),
430            output: PathBuf::from("./report"),
431            output_is_bare: false,
432            formats: None,
433            format_order: None,
434            formats_exts: FormatMappings::default(),
435            formats_names: FormatMappings::default(),
436            ignore_pattern: Vec::new(),
437            min_lines: 5,
438            min_tokens: 50,
439            max_lines: 1000,
440            max_size_bytes: 100 * 1024,
441            threshold: None,
442            mode: Mode::Mild,
443            store: None,
444            store_path: None,
445            blame: false,
446            cache: true,
447            silent: false,
448            absolute: false,
449            no_symlinks: false,
450            ignore_case: false,
451            gitignore: true,
452            debug: false,
453            verbose: false,
454            skip_local: false,
455            exit_code: ExitCode::Number(0.0),
456            no_tips: std::env::var_os("CI").is_some(),
457            tokens_to_skip: Vec::new(),
458        }
459    }
460}
461
462fn default_execution_id() -> String {
463    OffsetDateTime::now_utc()
464        .format(&Rfc3339)
465        .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string())
466}
467
468impl Options {
469    pub fn from_args<I, T>(args: I) -> Result<Self>
470    where
471        I: IntoIterator<Item = T>,
472        T: Into<OsString> + Clone,
473    {
474        let cli = Cli::try_parse_from(args)?;
475        Self::from_cli(cli)
476    }
477
478    pub fn from_cli(cli: Cli) -> Result<Self> {
479        let mut options = Self::default();
480
481        if matches!(cli.config.as_deref(), Some(path) if path == std::path::Path::new(BARE_CONFIG_VALUE))
482        {
483            bail!(
484                "TypeError [ERR_INVALID_ARG_TYPE]: The \"paths[0]\" argument must be of type string. Received type boolean (true)"
485            );
486        }
487
488        if let Some((config, config_dir, config_path)) = read_package_json_config()? {
489            options.config = Some(config_path);
490            apply_config(&mut options, config, &config_dir)?;
491        }
492        if let Some((config, config_dir, config_path)) = read_config(cli.config.as_deref())? {
493            options.config = Some(config_path);
494            apply_config(&mut options, config, &config_dir)?;
495        }
496
497        if !cli.paths.is_empty() {
498            options.paths = cli.paths;
499        }
500        if let Some(pattern) = cli.pattern {
501            options.pattern = pattern;
502        }
503        if let Some(ignore) = cli.ignore {
504            if is_bare_string(&ignore) {
505                bail!("TypeError: cli.ignore.split is not a function");
506            }
507            options.ignore = split_csv(&ignore);
508        }
509        if let Some(reporters) = cli.reporters {
510            if is_bare_string(&reporters) {
511                bail!("TypeError: cli.reporters.split is not a function");
512            }
513            options.reporters = split_csv(&reporters);
514        }
515        if let Some(output) = cli.output {
516            if is_bare_string(&output) {
517                options.output = PathBuf::from("true");
518                options.output_is_bare = true;
519            } else {
520                options.output = PathBuf::from(output);
521                options.output_is_bare = false;
522            }
523        }
524        if let Some(format) = cli.format {
525            if is_bare_string(&format) {
526                bail!("TypeError: cli.format.split is not a function");
527            }
528            let formats = split_csv(&format);
529            options.formats = Some(formats.iter().cloned().collect());
530            options.format_order = Some(formats);
531        }
532        if let Some(formats_exts) = cli.formats_exts {
533            if is_bare_string(&formats_exts) {
534                bail!("TypeError: extensions.split is not a function");
535            }
536            options.formats_exts = parse_format_mappings_like_upstream(&formats_exts)?;
537        }
538        if let Some(formats_names) = cli.formats_names {
539            if is_bare_string(&formats_names) {
540                bail!("TypeError: extensions.split is not a function");
541            }
542            options.formats_names = parse_format_mappings_like_upstream(&formats_names)?;
543        }
544        if let Some(ignore_pattern) = cli.ignore_pattern {
545            if is_bare_string(&ignore_pattern) {
546                bail!("TypeError: cli.ignorePattern.split is not a function");
547            }
548            options.ignore_pattern = compile_patterns(split_csv(&ignore_pattern))
549                .context("invalid --ignore-pattern value")?;
550        }
551        if let Some(min_lines) = cli.min_lines {
552            options.min_lines = min_lines;
553        }
554        if let Some(min_tokens) = cli.min_tokens {
555            options.min_tokens = min_tokens;
556        }
557        if let Some(max_lines) = cli.max_lines {
558            options.max_lines = max_lines;
559        }
560        if let Some(max_size) = cli.max_size {
561            options.max_size_bytes = parse_size(&max_size)
562                .with_context(|| format!("invalid --max-size value `{max_size}`"))?;
563        }
564        if let Some(threshold) = cli.threshold {
565            options.threshold = Some(threshold);
566        }
567        if let Some(mode) = cli.mode.as_deref() {
568            if is_bare_string(mode) {
569                bail!("TypeError: mode is not a function");
570            }
571            options.mode = parse_mode(mode)?;
572        }
573        if cli.skip_comments && cli.mode.is_none() {
574            options.mode = Mode::Weak;
575        }
576        if let Some(store) = cli.store {
577            options.store = Some(store);
578        }
579        if let Some(store_path) = cli.store_path {
580            options.store_path = Some(store_path);
581        }
582        if cli.blame {
583            options.blame = true;
584        }
585        if cli.silent {
586            options.silent = true;
587        }
588        if cli.absolute {
589            options.absolute = true;
590        }
591        if cli.no_symlinks {
592            options.no_symlinks = true;
593        }
594        if cli.ignore_case {
595            options.ignore_case = true;
596        }
597        if cli.no_gitignore {
598            options.gitignore = false;
599        } else if cli.gitignore {
600            options.gitignore = true;
601        }
602        if cli.debug {
603            options.debug = true;
604        }
605        if cli.verbose {
606            options.verbose = true;
607        }
608        if cli.skip_local {
609            options.skip_local = true;
610        }
611        if let Some(exit_code) = cli.exit_code {
612            options.exit_code = ExitCode::from_cli(exit_code);
613        }
614        if cli.no_tips {
615            options.no_tips = true;
616        }
617
618        apply_cwd_gitignore_patterns(&mut options)?;
619        normalize_reporters(&mut options);
620
621        Ok(options)
622    }
623}
624
625fn is_bare_string(value: &str) -> bool {
626    value == BARE_STRING_VALUE
627}
628
629pub fn resolve_node_exit_code(exit_code: &ExitCode) -> std::result::Result<i32, String> {
630    parsing::node_exit_code(exit_code).map_err(|error| error.message())
631}
632
633pub fn store_warning(options: &Options) -> Option<String> {
634    options
635        .store
636        .as_ref()
637        .map(|store| format!("store name {store} not installed."))
638}
639
640pub(super) fn parse_mode(value: &str) -> Result<Mode> {
641    match value {
642        "strict" => Ok(Mode::Strict),
643        "mild" => Ok(Mode::Mild),
644        "weak" => Ok(Mode::Weak),
645        _ => bail!("Mode {value} does not supported yet."),
646    }
647}
648
649fn normalize_reporters(options: &mut Options) {
650    if options.silent {
651        options
652            .reporters
653            .retain(|reporter| !reporter.contains("console"));
654        options.reporters.push("silent".to_string());
655    }
656    if options.threshold.is_some() {
657        options.reporters.push("threshold".to_string());
658    }
659}
660
661fn apply_cwd_gitignore_patterns(options: &mut Options) -> Result<()> {
662    let cwd = std::env::current_dir().context("failed to resolve current directory")?;
663    apply_gitignore_patterns_from(options, &cwd);
664    Ok(())
665}
666
667fn apply_gitignore_patterns_from(options: &mut Options, cwd: &std::path::Path) {
668    if options.gitignore {
669        options.ignore.extend(collect_cwd_gitignore_patterns(cwd));
670    }
671}