1use schemars::JsonSchema;
2use serde::{Deserialize, Serialize};
3
/// Serde default provider that yields `true`.
///
/// Referenced by name from `#[serde(default = "default_true")]`, so boolean
/// settings that should be on unless explicitly disabled deserialize as
/// `true` when their key is absent.
const fn default_true() -> bool {
    const ON_WHEN_OMITTED: bool = true;
    ON_WHEN_OMITTED
}
7
/// Serde default provider for the `min_tokens` setting: `50` when the key is
/// absent.
const fn default_min_tokens() -> usize {
    const FALLBACK_MIN_TOKENS: usize = 50;
    FALLBACK_MIN_TOKENS
}
11
/// Serde default provider for the `min_lines` setting: `5` when the key is
/// absent.
const fn default_min_lines() -> usize {
    const FALLBACK_MIN_LINES: usize = 5;
    FALLBACK_MIN_LINES
}
15
/// Serde default provider for the `min_corpus_size_for_shingle_filter`
/// setting: `1024` when the key is absent.
const fn default_min_corpus_size_for_shingle_filter() -> usize {
    // 2^10 == 1024.
    const FALLBACK_CORPUS_SIZE: usize = 1 << 10;
    FALLBACK_CORPUS_SIZE
}
19
/// Serde default provider for the `min_corpus_size_for_token_cache` setting:
/// `5_000` when the key is absent.
const fn default_min_corpus_size_for_token_cache() -> usize {
    const FALLBACK_CORPUS_SIZE: usize = 5_000;
    FALLBACK_CORPUS_SIZE
}
23
24#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
26#[serde(rename_all = "camelCase")]
27pub struct DuplicatesConfig {
28 #[serde(default = "default_true")]
30 pub enabled: bool,
31
32 #[serde(default)]
34 pub mode: DetectionMode,
35
36 #[serde(default = "default_min_tokens")]
38 pub min_tokens: usize,
39
40 #[serde(default = "default_min_lines")]
42 pub min_lines: usize,
43
44 #[serde(default)]
46 pub threshold: f64,
47
48 #[serde(default)]
50 pub ignore: Vec<String>,
51
52 #[serde(default = "default_true")]
56 pub ignore_defaults: bool,
57
58 #[serde(default)]
60 pub skip_local: bool,
61
62 #[serde(default)]
68 pub cross_language: bool,
69
70 #[serde(default)]
78 pub ignore_imports: bool,
79
80 #[serde(default)]
82 pub normalization: NormalizationConfig,
83
84 #[serde(default = "default_min_corpus_size_for_shingle_filter")]
87 pub min_corpus_size_for_shingle_filter: usize,
88
89 #[serde(default = "default_min_corpus_size_for_token_cache")]
94 pub min_corpus_size_for_token_cache: usize,
95}
96
97impl Default for DuplicatesConfig {
98 fn default() -> Self {
99 Self {
100 enabled: true,
101 mode: DetectionMode::default(),
102 min_tokens: default_min_tokens(),
103 min_lines: default_min_lines(),
104 threshold: 0.0,
105 ignore: vec![],
106 ignore_defaults: true,
107 skip_local: false,
108 cross_language: false,
109 ignore_imports: false,
110 normalization: NormalizationConfig::default(),
111 min_corpus_size_for_shingle_filter: default_min_corpus_size_for_shingle_filter(),
112 min_corpus_size_for_token_cache: default_min_corpus_size_for_token_cache(),
113 }
114 }
115}
116
117#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
123#[serde(rename_all = "camelCase")]
124pub struct NormalizationConfig {
125 #[serde(default, skip_serializing_if = "Option::is_none")]
128 pub ignore_identifiers: Option<bool>,
129
130 #[serde(default, skip_serializing_if = "Option::is_none")]
133 pub ignore_string_values: Option<bool>,
134
135 #[serde(default, skip_serializing_if = "Option::is_none")]
138 pub ignore_numeric_values: Option<bool>,
139}
140
/// Concrete normalization flags with every setting decided.
///
/// Unlike [`NormalizationConfig`], no field is optional here: each flag holds
/// its effective value once mode defaults and overrides have been combined.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ResolvedNormalization {
    /// Effective value of the `ignore_identifiers` flag.
    pub ignore_identifiers: bool,
    /// Effective value of the `ignore_string_values` flag.
    pub ignore_string_values: bool,
    /// Effective value of the `ignore_numeric_values` flag.
    pub ignore_numeric_values: bool,
}
148
149impl ResolvedNormalization {
150 #[must_use]
152 pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
153 let (default_ids, default_strings, default_numbers) = match mode {
154 DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
155 DetectionMode::Weak => (false, true, false),
156 DetectionMode::Semantic => (true, true, true),
157 };
158
159 Self {
160 ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
161 ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
162 ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
163 }
164 }
165}
166
167#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
175#[serde(rename_all = "lowercase")]
176pub enum DetectionMode {
177 Strict,
179 #[default]
181 Mild,
182 Weak,
184 Semantic,
186}
187
188impl std::fmt::Display for DetectionMode {
189 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190 match self {
191 Self::Strict => write!(f, "strict"),
192 Self::Mild => write!(f, "mild"),
193 Self::Weak => write!(f, "weak"),
194 Self::Semantic => write!(f, "semantic"),
195 }
196 }
197}
198
199impl std::str::FromStr for DetectionMode {
200 type Err = String;
201
202 fn from_str(s: &str) -> Result<Self, Self::Err> {
203 match s.to_lowercase().as_str() {
204 "strict" => Ok(Self::Strict),
205 "mild" => Ok(Self::Mild),
206 "weak" => Ok(Self::Weak),
207 "semantic" => Ok(Self::Semantic),
208 other => Err(format!("unknown detection mode: '{other}'")),
209 }
210 }
211}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolves `mode` with no user-supplied normalization overrides.
    fn resolve_without_overrides(mode: DetectionMode) -> ResolvedNormalization {
        ResolvedNormalization::resolve(mode, &NormalizationConfig::default())
    }

    /// Builds an expected flag set in (identifiers, strings, numbers) order.
    fn flags(ids: bool, strings: bool, numbers: bool) -> ResolvedNormalization {
        ResolvedNormalization {
            ignore_identifiers: ids,
            ignore_string_values: strings,
            ignore_numeric_values: numbers,
        }
    }

    /// Float comparison within `f64::EPSILON`, used for threshold checks.
    fn close(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    // ── DuplicatesConfig::default ────────────────────────────────────────

    #[test]
    fn duplicates_config_defaults() {
        let defaults = DuplicatesConfig::default();
        assert!(defaults.enabled);
        assert_eq!(defaults.mode, DetectionMode::Mild);
        assert_eq!(defaults.min_tokens, 50);
        assert_eq!(defaults.min_lines, 5);
        assert!(close(defaults.threshold, 0.0));
        assert!(defaults.ignore.is_empty());
        assert!(defaults.ignore_defaults);
        assert!(!defaults.skip_local);
        assert!(!defaults.cross_language);
        assert!(!defaults.ignore_imports);
        assert_eq!(defaults.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(defaults.min_corpus_size_for_token_cache, 5_000);
    }

    // ── DetectionMode: FromStr ───────────────────────────────────────────

    #[test]
    fn detection_mode_from_str_all_variants() {
        let cases = [
            ("strict", DetectionMode::Strict),
            ("mild", DetectionMode::Mild),
            ("weak", DetectionMode::Weak),
            ("semantic", DetectionMode::Semantic),
        ];
        for (text, expected) in cases {
            assert_eq!(text.parse::<DetectionMode>().unwrap(), expected);
        }
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        let cases = [
            ("STRICT", DetectionMode::Strict),
            ("Weak", DetectionMode::Weak),
            ("SEMANTIC", DetectionMode::Semantic),
        ];
        for (text, expected) in cases {
            assert_eq!(text.parse::<DetectionMode>().unwrap(), expected);
        }
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let message = "foobar".parse::<DetectionMode>().unwrap_err();
        for fragment in ["unknown detection mode", "foobar"] {
            assert!(message.contains(fragment));
        }
    }

    // ── DetectionMode: Display ───────────────────────────────────────────

    #[test]
    fn detection_mode_display() {
        let cases = [
            (DetectionMode::Strict, "strict"),
            (DetectionMode::Mild, "mild"),
            (DetectionMode::Weak, "weak"),
            (DetectionMode::Semantic, "semantic"),
        ];
        for (mode, text) in cases {
            assert_eq!(mode.to_string(), text);
        }
    }

    // ── ResolvedNormalization::resolve: mode defaults ────────────────────

    #[test]
    fn resolve_strict_mode_all_false() {
        assert_eq!(
            resolve_without_overrides(DetectionMode::Strict),
            flags(false, false, false)
        );
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        assert_eq!(
            resolve_without_overrides(DetectionMode::Mild),
            flags(false, false, false)
        );
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        assert_eq!(
            resolve_without_overrides(DetectionMode::Weak),
            flags(false, true, false)
        );
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        assert_eq!(
            resolve_without_overrides(DetectionMode::Semantic),
            flags(true, true, true)
        );
    }

    // ── ResolvedNormalization::resolve: overrides ────────────────────────

    #[test]
    fn resolve_override_forces_true() {
        // Strict would leave identifiers alone; the override switches it on.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ..NormalizationConfig::default()
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Strict, &overrides),
            flags(true, false, false)
        );
    }

    #[test]
    fn resolve_override_forces_false() {
        // Semantic enables everything; two flags are forced back off and the
        // third keeps the mode default.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ..NormalizationConfig::default()
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides),
            flags(false, false, true)
        );
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        // Every flag is overridden, so none of the Weak defaults survive.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: Some(false),
            ignore_numeric_values: Some(true),
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Weak, &overrides),
            flags(true, false, true)
        );
    }

    // ── JSON deserialization ─────────────────────────────────────────────

    #[test]
    fn duplicates_config_json_all_fields() {
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "ignoreDefaults": false,
            "skipLocal": true,
            "crossLanguage": true,
            "ignoreImports": true
        }"#;
        let parsed = serde_json::from_str::<DuplicatesConfig>(json).unwrap();
        assert!(!parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Semantic);
        assert_eq!(parsed.min_tokens, 100);
        assert_eq!(parsed.min_lines, 10);
        assert!(close(parsed.threshold, 5.0));
        assert_eq!(parsed.ignore, vec!["**/vendor/**"]);
        assert!(!parsed.ignore_defaults);
        assert!(parsed.skip_local);
        assert!(parsed.cross_language);
        assert!(parsed.ignore_imports);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let parsed = serde_json::from_str::<DuplicatesConfig>(r#"{"mode": "weak"}"#).unwrap();
        // Only `mode` was given; everything else falls back to its default.
        assert!(parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Weak);
        assert_eq!(parsed.min_tokens, 50);
        assert_eq!(parsed.min_lines, 5);
        assert!(parsed.ignore_defaults);
    }

    #[test]
    fn duplicates_config_json_ignore_defaults_merges_by_default() {
        let parsed =
            serde_json::from_str::<DuplicatesConfig>(r#"{"ignore": ["**/foo/**"]}"#).unwrap();
        assert_eq!(parsed.ignore, vec!["**/foo/**"]);
        assert!(parsed.ignore_defaults);
    }

    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let parsed = serde_json::from_str::<NormalizationConfig>(json).unwrap();
        assert_eq!(parsed.ignore_identifiers, Some(true));
        assert_eq!(parsed.ignore_string_values, Some(false));
        assert_eq!(parsed.ignore_numeric_values, None);
    }

    // ── TOML deserialization ─────────────────────────────────────────────

    #[test]
    fn duplicates_config_toml_all_fields() {
        let toml_str = r#"
enabled = false
mode = "weak"
minTokens = 75
minLines = 8
threshold = 3.0
ignore = ["vendor/**"]
skipLocal = true
crossLanguage = true
ignoreImports = true

[normalization]
ignoreIdentifiers = true
ignoreStringValues = true
ignoreNumericValues = false
"#;
        let parsed = toml::from_str::<DuplicatesConfig>(toml_str).unwrap();
        assert!(!parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Weak);
        assert_eq!(parsed.min_tokens, 75);
        assert_eq!(parsed.min_lines, 8);
        assert!(close(parsed.threshold, 3.0));
        assert_eq!(parsed.ignore, vec!["vendor/**"]);
        assert!(parsed.skip_local);
        assert!(parsed.cross_language);
        assert!(parsed.ignore_imports);

        let normalization = &parsed.normalization;
        assert_eq!(normalization.ignore_identifiers, Some(true));
        assert_eq!(normalization.ignore_string_values, Some(true));
        assert_eq!(normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn duplicates_config_toml_defaults() {
        // An empty document must deserialize to the defaults.
        let parsed = toml::from_str::<DuplicatesConfig>("").unwrap();
        assert!(parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Mild);
        assert_eq!(parsed.min_tokens, 50);
        assert_eq!(parsed.min_lines, 5);
    }

    // ── NormalizationConfig defaults ─────────────────────────────────────

    #[test]
    fn normalization_config_default_all_none() {
        let defaults = NormalizationConfig::default();
        assert!(defaults.ignore_identifiers.is_none());
        assert!(defaults.ignore_string_values.is_none());
        assert!(defaults.ignore_numeric_values.is_none());
    }

    #[test]
    fn normalization_config_empty_json_object() {
        let parsed = serde_json::from_str::<NormalizationConfig>("{}").unwrap();
        assert!(parsed.ignore_identifiers.is_none());
        assert!(parsed.ignore_string_values.is_none());
        assert!(parsed.ignore_numeric_values.is_none());
    }

    // ── DetectionMode default ────────────────────────────────────────────

    #[test]
    fn detection_mode_default_is_mild() {
        assert_eq!(DetectionMode::default(), DetectionMode::Mild);
    }

    // ── ResolvedNormalization equality ───────────────────────────────────

    #[test]
    fn resolved_normalization_equality() {
        let first = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let same = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_eq!(first, same);

        // Differs from `first` in exactly one flag.
        let different = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_ne!(first, different);
    }

    // ── DetectionMode serde ──────────────────────────────────────────────

    #[test]
    fn detection_mode_json_deserialization() {
        let cases = [
            (r#""strict""#, DetectionMode::Strict),
            (r#""mild""#, DetectionMode::Mild),
            (r#""weak""#, DetectionMode::Weak),
            (r#""semantic""#, DetectionMode::Semantic),
        ];
        for (json, expected) in cases {
            let parsed: DetectionMode = serde_json::from_str(json).unwrap();
            assert_eq!(parsed, expected);
        }
    }

    #[test]
    fn detection_mode_invalid_json() {
        assert!(serde_json::from_str::<DetectionMode>(r#""aggressive""#).is_err());
    }

    // ── Serialization round trip ─────────────────────────────────────────

    #[test]
    fn duplicates_config_json_roundtrip() {
        let original = DuplicatesConfig {
            enabled: false,
            mode: DetectionMode::Semantic,
            min_tokens: 100,
            min_lines: 10,
            threshold: 5.5,
            ignore: vec!["test/**".to_string()],
            ignore_defaults: false,
            skip_local: true,
            cross_language: true,
            ignore_imports: true,
            normalization: NormalizationConfig {
                ignore_identifiers: Some(true),
                ignore_string_values: None,
                ignore_numeric_values: Some(false),
            },
            min_corpus_size_for_shingle_filter: 2048,
            min_corpus_size_for_token_cache: 8_000,
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let restored = serde_json::from_str::<DuplicatesConfig>(&encoded).unwrap();

        assert!(!restored.enabled);
        assert_eq!(restored.mode, DetectionMode::Semantic);
        assert_eq!(restored.min_tokens, 100);
        assert_eq!(restored.min_lines, 10);
        assert!(close(restored.threshold, 5.5));
        assert!(!restored.ignore_defaults);
        assert!(restored.skip_local);
        assert!(restored.cross_language);
        assert_eq!(restored.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(restored.min_corpus_size_for_token_cache, 8_000);
        assert!(restored.ignore_imports);
        assert_eq!(restored.normalization.ignore_identifiers, Some(true));
        assert!(restored.normalization.ignore_string_values.is_none());
        assert_eq!(restored.normalization.ignore_numeric_values, Some(false));
    }

    // ── NormalizationConfig: skip_serializing_if ─────────────────────────

    #[test]
    fn normalization_none_fields_not_serialized() {
        let encoded = serde_json::to_string(&NormalizationConfig::default()).unwrap();
        for key in [
            "ignoreIdentifiers",
            "ignoreStringValues",
            "ignoreNumericValues",
        ] {
            assert!(!encoded.contains(key), "None fields should be skipped");
        }
    }

    #[test]
    fn normalization_some_fields_serialized() {
        let partially_set = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: Some(false),
        };
        let encoded = serde_json::to_string(&partially_set).unwrap();
        assert!(encoded.contains("ignoreIdentifiers"));
        assert!(!encoded.contains("ignoreStringValues"));
        assert!(encoded.contains("ignoreNumericValues"));
    }
}