Skip to main content

jscpd_rs/detector/
model.rs

1use std::collections::{BTreeMap, HashMap};
2use std::sync::Arc;
3
4use serde::Serialize;
5
6use crate::tokenizer::Location;
7
8/// Git blame lines keyed by line number.
9pub type BlamedLines = BTreeMap<String, BlamedLine>;
10
11/// Git blame information for one duplicated source line.
12#[derive(Clone, Debug, Serialize)]
13pub struct BlamedLine {
14    /// Commit revision.
15    pub rev: String,
16    /// Author name reported by Git.
17    pub author: String,
18    /// Author or commit date reported by Git.
19    pub date: String,
20    /// Source line text.
21    pub line: String,
22}
23
24#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
25pub(super) struct SourceId(pub(super) usize);
26
27#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
28pub(super) struct FormatId(pub(super) usize);
29
30/// One duplicated fragment in a source file.
31#[derive(Clone, Debug, Serialize)]
32pub struct Fragment {
33    #[serde(rename = "sourceId")]
34    /// Source identifier, usually a path.
35    pub source_id: String,
36    /// Start location of the duplicated fragment.
37    pub start: Location,
38    /// End location of the duplicated fragment.
39    pub end: Location,
40    /// Byte range of the duplicated fragment.
41    pub range: [usize; 2],
42    #[serde(skip_serializing_if = "Option::is_none")]
43    /// Optional Git blame information keyed by line number.
44    pub blame: Option<BlamedLines>,
45}
46
47/// Pair of duplicated fragments reported as one clone.
48#[derive(Clone, Debug, Serialize)]
49pub struct CloneMatch {
50    /// Format name shared by both fragments.
51    pub format: String,
52    #[serde(rename = "duplicationA")]
53    /// First duplicated fragment.
54    pub duplication_a: Fragment,
55    #[serde(rename = "duplicationB")]
56    /// Second duplicated fragment.
57    pub duplication_b: Fragment,
58    /// Number of detection tokens in the clone.
59    pub tokens: usize,
60}
61
62/// Clone skipped from final output with compatibility/debug messages.
63#[derive(Clone, Debug)]
64pub struct SkippedClone {
65    /// Skipped clone candidate.
66    pub clone: CloneMatch,
67    /// Reason messages explaining why the clone was skipped.
68    pub message: Vec<String>,
69}
70
71/// Aggregated duplication counters for a source, format, or whole run.
72#[derive(Clone, Debug, Default, Serialize)]
73pub struct StatisticRow {
74    /// Total line count.
75    pub lines: usize,
76    /// Total token count.
77    pub tokens: usize,
78    /// Number of sources included in the row.
79    pub sources: usize,
80    /// Number of clone pairs.
81    pub clones: usize,
82    #[serde(rename = "duplicatedLines")]
83    /// Number of lines covered by at least one clone.
84    pub duplicated_lines: usize,
85    #[serde(rename = "duplicatedTokens")]
86    /// Number of duplicated tokens.
87    pub duplicated_tokens: usize,
88    /// Duplicated line percentage.
89    pub percentage: f64,
90    #[serde(rename = "percentageTokens")]
91    /// Duplicated token percentage.
92    pub percentage_tokens: f64,
93    #[serde(rename = "newDuplicatedLines")]
94    /// New duplicated line count, kept for upstream report shape.
95    pub new_duplicated_lines: usize,
96    #[serde(rename = "newClones")]
97    /// New clone count, kept for upstream report shape.
98    pub new_clones: usize,
99}
100
101/// Duplication statistics grouped by format.
102#[derive(Clone, Debug, Default, Serialize)]
103pub struct FormatStatistic {
104    /// Per-source statistics for this format.
105    pub sources: HashMap<String, StatisticRow>,
106    /// Total statistics for this format.
107    pub total: StatisticRow,
108}
109
110/// Duplication statistics for a full detection run.
111#[derive(Clone, Debug, Default, Serialize)]
112pub struct Statistics {
113    /// Total statistics across all formats.
114    pub total: StatisticRow,
115    /// Statistics grouped by format name.
116    pub formats: HashMap<String, FormatStatistic>,
117}
118
119/// Summary of one analyzed source.
120#[derive(Clone, Debug, Serialize)]
121pub struct SourceSummary {
122    /// Source path or identifier.
123    pub path: String,
124    /// Detected or assigned format.
125    pub format: String,
126    /// Source line count.
127    pub lines: usize,
128    /// Detection token count.
129    pub tokens: usize,
130}
131
132/// Complete detector output.
133#[derive(Clone, Debug, Serialize)]
134pub struct DetectionResult {
135    /// Reported clone pairs.
136    pub clones: Vec<CloneMatch>,
137    #[serde(skip)]
138    /// Clone candidates skipped from final reports.
139    pub skipped_clones: Vec<SkippedClone>,
140    /// Aggregate statistics.
141    pub statistics: Statistics,
142    /// Analyzed source summaries.
143    pub sources: Vec<SourceSummary>,
144    #[serde(skip)]
145    /// Source contents keyed by source identifier for reporters that need
146    /// fragments.
147    pub source_contents: HashMap<String, String>,
148}
149
150#[derive(Clone, Debug)]
151pub(super) struct TokenSpan {
152    pub(super) start: Location,
153    pub(super) end: Location,
154    pub(super) range: [usize; 2],
155}
156
157#[derive(Clone, Debug)]
158pub(super) struct SourceMeta {
159    pub(super) source_id: String,
160    pub(super) format: String,
161    pub(super) lines: usize,
162    pub(super) tokens: usize,
163}
164
165#[derive(Clone, Debug)]
166pub(super) struct TokenStream {
167    pub(super) source_id: SourceId,
168    pub(super) format_id: FormatId,
169    pub(super) hashes: Vec<u64>,
170    pub(super) spans: Vec<TokenSpan>,
171}
172
173#[derive(Clone, Copy, Debug)]
174pub(super) struct Occurrence {
175    pub(super) source_id: SourceId,
176    pub(super) token_start: usize,
177}
178
179#[derive(Clone, Debug)]
180pub(super) struct PreparedSource {
181    pub(super) meta: SourceMeta,
182    pub(super) stream: TokenStream,
183}
184
185#[derive(Clone, Debug)]
186pub(crate) struct PreparedSourceDraft {
187    pub(super) meta: SourceMeta,
188    pub(super) content: Arc<str>,
189    pub(super) hashes: Arc<Vec<u64>>,
190    pub(super) spans: Arc<Vec<TokenSpan>>,
191}