Skip to main content

jscpd_rs/
lib.rs

1#![doc(html_root_url = "https://docs.rs/jscpd-rs/0.1.2")]
2
3//! Native Rust API for `jscpd-rs`, a 50x+ faster duplicate-code detector for
4//! local development and CI/CD.
5//!
6//! `jscpd-rs` scans a codebase, finds copy-paste fragments across files, writes
7//! console, JSON, SARIF, HTML, XML, CSV, Markdown, badge, and Xcode reports,
8//! and can fail a build when duplication crosses a configured threshold.
9//!
10//! It is a native Rust implementation of the common
11//! [`jscpd`](https://github.com/kucherenko/jscpd) command-line workflow:
12//! upstream-style CLI flags, `.jscpd.json` and `package.json#jscpd`
13//! configuration, report formats, exit-code behavior, Git blame, and server
14//! snippet checks. The current public benchmark suite records 50x+ speedups on
15//! pinned React, Next.js, and Prometheus cases while using a coverage-first
16//! compatibility gate against upstream `jscpd`.
17//!
18//! This crate exposes the same detector core used by the `jscpd` and
19//! `jscpd-server` binaries: option parsing, file discovery, tokenization,
20//! duplicate detection, statistics, and in-memory source checks.
21//!
22//! # Quick Start
23//!
24//! Scan paths using the same option model as the CLI:
25//!
26//! ```no_run
27//! use std::path::PathBuf;
28//!
29//! # fn main() -> anyhow::Result<()> {
30//! let mut options = jscpd_rs::get_default_options();
31//! options.paths = vec![PathBuf::from("src")];
32//! options.reporters.clear();
33//! options.silent = true;
34//!
35//! let result = jscpd_rs::detect_clones_and_statistics(&options)?;
36//! println!("{} clones", result.clones.len());
37//! # Ok(())
38//! # }
39//! ```
40//!
41//! Check prepared in-memory sources without touching the filesystem:
42//!
43//! ```
44//! let mut options = jscpd_rs::get_default_options();
45//! options.reporters.clear();
46//! options.min_lines = 2;
47//! options.min_tokens = 5;
48//!
49//! let files = vec![
50//!     jscpd_rs::SourceFile {
51//!         source_id: "a.js".to_string(),
52//!         format: "javascript".to_string(),
53//!         content: "const a = 1;\nconst b = 2;\nconst c = a + b;\n".to_string(),
54//!     },
55//!     jscpd_rs::SourceFile {
56//!         source_id: "b.js".to_string(),
57//!         format: "javascript".to_string(),
58//!         content: "const a = 1;\nconst b = 2;\nconst c = a + b;\n".to_string(),
59//!     },
60//! ];
61//!
62//! let result = jscpd_rs::detect_source_files(files, &options);
63//! assert!(!result.clones.is_empty());
64//! ```
65//!
66//! # Main Entry Points
67//!
68//! - [`get_options_from_args`] parses upstream-style CLI arguments into
69//!   [`Options`].
70//! - [`detect_clones`] and [`detect_clones_and_statistics`] run discovery,
71//!   tokenization, duplicate detection, statistics, and optional Git blame.
72//! - [`detect_source_files`] runs detection against caller-provided
73//!   [`SourceFile`] values and is the best entry point for editors, servers,
74//!   and tests.
75//! - [`Tokenizer`] exposes the native token map generator used by the detector.
76//! - [`Detector`] and [`MemoryStore`] provide Rust counterparts for the main
77//!   upstream core classes.
78//! - [`jscpd`] and [`jscpd_with_exit_callback`] provide an embeddable argv
79//!   runner similar to upstream `jscpd(argv, exitCallback?)`.
80//!
81//! # Compatibility Model
82//!
83//! The release gate is coverage-first: for the same inputs and options, this
84//! crate must not miss duplicated source lines reported by upstream `jscpd`.
85//! Extra Rust findings remain visible in compatibility reports while the
86//! implementation converges on exact parity.
87//!
88//! The first release intentionally keeps the detector native-only. Dynamic npm
89//! reporters, stores, listeners, and plugins are not loaded by this crate.
90//!
91//! See the
92//! [README](https://github.com/vv-bogdanov/jscpd-rs#readme) and
93//! [User Guide](https://github.com/vv-bogdanov/jscpd-rs/blob/main/docs/user-guide.md)
94//! for CLI, configuration, reporter, server, and CI examples.
95
96mod app;
97mod blame;
98mod cli;
99mod detector;
100mod files;
101mod formats;
102mod report;
103pub mod server;
104mod tokenizer;
105mod verbose;
106
107use std::{ffi::OsString, path::Path};
108
109use anyhow::Result;
110
111pub use app::{
112    JscpdOutcome, jscpd, jscpd_with_exit_callback, run_cli_args, run_current_process,
113    upstream_stdout_error,
114};
115pub use cli::{Cli, ExitCode, FormatMappings, Mode, Options};
116pub use detector::{
117    BlamedLine, BlamedLines, CloneMatch, DetectionResult, Detector, Fragment, MemoryStore,
118    MemoryStoreError, SkippedClone, SourceSummary, Statistic, StatisticRow, Statistics,
119};
120pub use files::SourceFile;
121pub use report::ThresholdExceeded;
122pub use tokenizer::{DetectionToken, Location, SourceTokenMap, TokenMap, Tokenizer};
123
124/// Return the upstream-compatible default option set.
125///
126/// The defaults match the CLI defaults used by the `jscpd` binary: all
127/// supported formats, `min_lines = 5`, `min_tokens = 50`, `max_lines = 1000`,
128/// `max_size = 100kb`, Git ignore handling enabled, and the console reporter
129/// selected.
130pub fn get_default_options() -> Options {
131    Options::default()
132}
133
134/// Parse upstream-style command-line arguments into normalized [`Options`].
135///
136/// The first argument should be the binary name, just like `std::env::args`.
137/// This is useful for native integrations that want the same option semantics
138/// as the CLI without spawning a process.
139pub fn get_options_from_args<I, T>(args: I) -> Result<Options>
140where
141    I: IntoIterator<Item = T>,
142    T: Into<OsString> + Clone,
143{
144    Options::from_args(args)
145}
146
147/// Return the names of all formats known to the synchronized format registry.
148///
149/// The first release keeps the registry aligned with upstream `jscpd`; high
150/// volume JS/TS formats use native Oxc-backed tokenization and long-tail
151/// formats use the generic native tokenizer unless promoted by compatibility
152/// evidence.
153pub fn get_supported_formats() -> Vec<&'static str> {
154    formats::supported_formats()
155}
156
157/// Resolve a source format from a path using the built-in extension and
158/// filename registry.
159pub fn get_format_by_file(path: impl AsRef<Path>) -> Option<String> {
160    get_format_by_file_with_mappings(path, &FormatMappings::default(), &FormatMappings::default())
161}
162
163/// Resolve a source format from a path with caller-provided extension and
164/// filename mappings.
165///
166/// This mirrors the CLI `--formats-exts` and `--formats-names` options.
167pub fn get_format_by_file_with_mappings(
168    path: impl AsRef<Path>,
169    formats_exts: &FormatMappings,
170    formats_names: &FormatMappings,
171) -> Option<String> {
172    formats::format_for_path(path.as_ref(), formats_exts, formats_names).map(str::to_string)
173}
174
175/// Detect clones from files discovered through [`Options::paths`].
176///
177/// This is the compact path-based API when callers only need clone matches and
178/// not the full statistics object.
179pub fn detect_clones(options: &Options) -> Result<Vec<CloneMatch>> {
180    Ok(detect_clones_and_statistics(options)?.clones)
181}
182
183/// Upstream-named alias for [`detect_clones_and_statistics`].
184///
185/// The singular `statistic` spelling is kept for callers porting from upstream
186/// JavaScript APIs and examples.
187pub fn detect_clones_and_statistic(options: &Options) -> Result<DetectionResult> {
188    detect_clones_and_statistics(options)
189}
190
191/// Detect clones and return both clone matches and aggregate statistics.
192///
193/// This entry point performs ignore-aware file discovery from [`Options::paths`]
194/// before delegating to the native detector. Use [`detect_source_files`] when
195/// the caller already has source contents in memory.
196pub fn detect_clones_and_statistics(options: &Options) -> Result<DetectionResult> {
197    let files = files::discover(options)?;
198    Ok(detect_source_files(files, options))
199}
200
201/// Detect clones in prepared in-memory sources.
202///
203/// This is the lowest-friction API for editor integrations, tests, snippets,
204/// and services that already own source contents. The `format` field on each
205/// [`SourceFile`] should contain one of the names returned by
206/// [`get_supported_formats`].
207pub fn detect_source_files(files: Vec<SourceFile>, options: &Options) -> DetectionResult {
208    let mut result = detector::detect(files, options);
209    if options.blame {
210        blame::apply_blame(&mut result);
211    }
212    result
213}