1use schemars::JsonSchema;
2use serde::{Deserialize, Deserializer, Serialize};
3
/// Serde default for boolean options that are switched on unless the
/// configuration says otherwise.
const fn default_true() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
7
/// Serde default for `minTokens`, used when the key is absent.
const fn default_min_tokens() -> usize {
    const DEFAULT_MIN_TOKENS: usize = 50;
    DEFAULT_MIN_TOKENS
}
11
/// Serde default for `minLines`, used when the key is absent.
const fn default_min_lines() -> usize {
    const DEFAULT_MIN_LINES: usize = 5;
    DEFAULT_MIN_LINES
}
15
/// Serde default for `minOccurrences`, used when the key is absent.
///
/// This is also the smallest value the deserializer accepts for that key.
const fn default_min_occurrences() -> usize {
    const DEFAULT_MIN_OCCURRENCES: usize = 2;
    DEFAULT_MIN_OCCURRENCES
}
19
20fn deserialize_min_occurrences<'de, D>(deserializer: D) -> Result<usize, D::Error>
24where
25 D: Deserializer<'de>,
26{
27 let value = usize::deserialize(deserializer)?;
28 if value < 2 {
29 return Err(serde::de::Error::custom(format!(
30 "minOccurrences must be at least 2 (got {value}); a single occurrence isn't a duplicate"
31 )));
32 }
33 Ok(value)
34}
35
/// Serde default for `minCorpusSizeForShingleFilter`, used when the key is absent.
const fn default_min_corpus_size_for_shingle_filter() -> usize {
    const DEFAULT_SHINGLE_FILTER_CORPUS: usize = 1024;
    DEFAULT_SHINGLE_FILTER_CORPUS
}
39
/// Serde default for `minCorpusSizeForTokenCache`, used when the key is absent.
const fn default_min_corpus_size_for_token_cache() -> usize {
    const DEFAULT_TOKEN_CACHE_CORPUS: usize = 5_000;
    DEFAULT_TOKEN_CACHE_CORPUS
}
43
/// Configuration for duplicate detection.
///
/// Keys are camelCase in serialized form (`minTokens`, `ignoreDefaults`, ...).
/// Every field has a serde default, so an empty document deserializes to the
/// same values as [`DuplicatesConfig::default`].
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct DuplicatesConfig {
    /// Whether duplicate detection is enabled. Defaults to `true`.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Detection mode; selects the baseline normalization flags (see
    /// [`ResolvedNormalization::resolve`]). Defaults to [`DetectionMode::Mild`].
    #[serde(default)]
    pub mode: DetectionMode,

    /// Minimum token count. Defaults to 50.
    // NOTE(review): presumably the smallest fragment size, in tokens, that is
    // considered — confirm against the detector.
    #[serde(default = "default_min_tokens")]
    pub min_tokens: usize,

    /// Minimum line count. Defaults to 5.
    // NOTE(review): presumably the smallest fragment size, in lines, that is
    // considered — confirm against the detector.
    #[serde(default = "default_min_lines")]
    pub min_lines: usize,

    /// Minimum number of occurrences for a fragment to count as a duplicate.
    ///
    /// Must be at least 2; smaller values are rejected during deserialization
    /// and the generated schema advertises the same minimum. Defaults to 2.
    #[serde(
        default = "default_min_occurrences",
        deserialize_with = "deserialize_min_occurrences"
    )]
    #[schemars(range(min = 2))]
    pub min_occurrences: usize,

    /// Threshold value. Defaults to `0.0`.
    // NOTE(review): the unit and how the detector applies this value are not
    // visible in this file — document once confirmed.
    #[serde(default)]
    pub threshold: f64,

    /// Patterns to ignore. Defaults to an empty list.
    // NOTE(review): the tests use glob-style strings such as `**/vendor/**`;
    // presumably these are matched against file paths — confirm.
    #[serde(default)]
    pub ignore: Vec<String>,

    /// Defaults to `true`.
    // NOTE(review): presumably keeps a built-in ignore list in effect
    // alongside `ignore` — confirm against the consumer of this flag.
    #[serde(default = "default_true")]
    pub ignore_defaults: bool,

    /// Defaults to `false`.
    // NOTE(review): semantics are not visible in this file — TODO confirm.
    #[serde(default)]
    pub skip_local: bool,

    /// Defaults to `false`.
    // NOTE(review): presumably enables matching across different languages —
    // TODO confirm.
    #[serde(default)]
    pub cross_language: bool,

    /// Defaults to `false`.
    // NOTE(review): presumably excludes import statements from detection —
    // TODO confirm.
    #[serde(default)]
    pub ignore_imports: bool,

    /// Per-flag normalization overrides layered on top of the defaults implied
    /// by `mode`. Defaults to no overrides.
    #[serde(default)]
    pub normalization: NormalizationConfig,

    /// Defaults to 1024.
    // NOTE(review): presumably the corpus size from which a shingle filter is
    // used — confirm the unit and comparison with the detector.
    #[serde(default = "default_min_corpus_size_for_shingle_filter")]
    pub min_corpus_size_for_shingle_filter: usize,

    /// Defaults to 5000.
    // NOTE(review): presumably the corpus size from which a token cache is
    // used — confirm the unit and comparison with the detector.
    #[serde(default = "default_min_corpus_size_for_token_cache")]
    pub min_corpus_size_for_token_cache: usize,
}
127
128impl Default for DuplicatesConfig {
129 fn default() -> Self {
130 Self {
131 enabled: true,
132 mode: DetectionMode::default(),
133 min_tokens: default_min_tokens(),
134 min_lines: default_min_lines(),
135 min_occurrences: default_min_occurrences(),
136 threshold: 0.0,
137 ignore: vec![],
138 ignore_defaults: true,
139 skip_local: false,
140 cross_language: false,
141 ignore_imports: false,
142 normalization: NormalizationConfig::default(),
143 min_corpus_size_for_shingle_filter: default_min_corpus_size_for_shingle_filter(),
144 min_corpus_size_for_token_cache: default_min_corpus_size_for_token_cache(),
145 }
146 }
147}
148
/// Optional per-flag overrides for normalization.
///
/// Each field is tri-state: `None` defers to the default implied by the
/// detection mode, while `Some(_)` forces the flag (see
/// [`ResolvedNormalization::resolve`]). `None` fields are omitted when
/// serializing, and keys are camelCase in serialized form.
#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct NormalizationConfig {
    /// Override for ignoring identifiers; `None` keeps the mode default.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_identifiers: Option<bool>,

    /// Override for ignoring string values; `None` keeps the mode default.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_string_values: Option<bool>,

    /// Override for ignoring numeric values; `None` keeps the mode default.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_numeric_values: Option<bool>,
}
172
/// Fully resolved normalization flags: every flag is a concrete `bool`.
///
/// Produced by [`ResolvedNormalization::resolve`] from a detection mode plus
/// optional overrides.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ResolvedNormalization {
    /// Whether identifiers are ignored.
    pub ignore_identifiers: bool,
    /// Whether string values are ignored.
    pub ignore_string_values: bool,
    /// Whether numeric values are ignored.
    pub ignore_numeric_values: bool,
}
180
181impl ResolvedNormalization {
182 #[must_use]
184 pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
185 let (default_ids, default_strings, default_numbers) = match mode {
186 DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
187 DetectionMode::Weak => (false, true, false),
188 DetectionMode::Semantic => (true, true, true),
189 };
190
191 Self {
192 ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
193 ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
194 ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
195 }
196 }
197}
198
/// Detection mode for duplicate detection.
///
/// Serialized as the lowercase variant name (`"strict"`, `"mild"`, `"weak"`,
/// `"semantic"`). The mode determines the baseline normalization flags in
/// [`ResolvedNormalization::resolve`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "lowercase")]
pub enum DetectionMode {
    /// No normalization flags enabled by default.
    // NOTE(review): what distinguishes `Strict` from `Mild` is not visible in
    // this file; both resolve to the same normalization defaults.
    Strict,
    /// The default mode. No normalization flags enabled by default.
    #[default]
    Mild,
    /// Ignores string values by default.
    Weak,
    /// Ignores identifiers, string values and numeric values by default.
    Semantic,
}
219
220impl std::fmt::Display for DetectionMode {
221 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
222 match self {
223 Self::Strict => write!(f, "strict"),
224 Self::Mild => write!(f, "mild"),
225 Self::Weak => write!(f, "weak"),
226 Self::Semantic => write!(f, "semantic"),
227 }
228 }
229}
230
231impl std::str::FromStr for DetectionMode {
232 type Err = String;
233
234 fn from_str(s: &str) -> Result<Self, Self::Err> {
235 match s.to_lowercase().as_str() {
236 "strict" => Ok(Self::Strict),
237 "mild" => Ok(Self::Mild),
238 "weak" => Ok(Self::Weak),
239 "semantic" => Ok(Self::Semantic),
240 other => Err(format!("unknown detection mode: '{other}'")),
241 }
242 }
243}
244
#[cfg(test)]
mod tests {
    use super::*;

    // ── DuplicatesConfig defaults ──────────────────────────────────

    // `Default` yields the expected baseline value for every checked field.
    #[test]
    fn duplicates_config_defaults() {
        let config = DuplicatesConfig::default();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert_eq!(config.min_occurrences, 2);
        assert!((config.threshold - 0.0).abs() < f64::EPSILON);
        assert!(config.ignore.is_empty());
        assert!(config.ignore_defaults);
        assert!(!config.skip_local);
        assert!(!config.cross_language);
        assert!(!config.ignore_imports);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(config.min_corpus_size_for_token_cache, 5_000);
    }

    // ── DetectionMode: FromStr ─────────────────────────────────────

    // Every variant parses from its lowercase name.
    #[test]
    fn detection_mode_from_str_all_variants() {
        assert_eq!(
            "strict".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "mild".parse::<DetectionMode>().unwrap(),
            DetectionMode::Mild
        );
        assert_eq!(
            "weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "semantic".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    // Parsing ignores the case of the input.
    #[test]
    fn detection_mode_from_str_case_insensitive() {
        assert_eq!(
            "STRICT".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "Weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "SEMANTIC".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    // Unknown names produce an error that mentions the rejected input.
    #[test]
    fn detection_mode_from_str_unknown() {
        let err = "foobar".parse::<DetectionMode>().unwrap_err();
        assert!(err.contains("unknown detection mode"));
        assert!(err.contains("foobar"));
    }

    // ── DetectionMode: Display ─────────────────────────────────────

    // `Display` prints the lowercase variant name.
    #[test]
    fn detection_mode_display() {
        assert_eq!(DetectionMode::Strict.to_string(), "strict");
        assert_eq!(DetectionMode::Mild.to_string(), "mild");
        assert_eq!(DetectionMode::Weak.to_string(), "weak");
        assert_eq!(DetectionMode::Semantic.to_string(), "semantic");
    }

    // ── ResolvedNormalization::resolve ─────────────────────────────

    // Strict without overrides: nothing is ignored.
    #[test]
    fn resolve_strict_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    // Mild without overrides: nothing is ignored.
    #[test]
    fn resolve_mild_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Mild, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    // Weak without overrides: only string values are ignored.
    #[test]
    fn resolve_weak_mode_only_strings_true() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    // Semantic without overrides: everything is ignored.
    #[test]
    fn resolve_semantic_mode_all_true() {
        let resolved = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    // An explicit `Some(true)` turns a flag on even in Strict mode.
    #[test]
    fn resolve_override_forces_true() {
        let overrides = NormalizationConfig {
            // Strict defaults this flag to false; the override flips it.
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    // An explicit `Some(false)` turns a flag off even in Semantic mode.
    #[test]
    fn resolve_override_forces_false() {
        let overrides = NormalizationConfig {
            // Semantic defaults these flags to true; the overrides clear them.
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides);
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        // Not overridden, so the Semantic default still applies.
        assert!(resolved.ignore_numeric_values);
    }

    // With all three overrides set, none of the Weak defaults survive.
    #[test]
    fn resolve_all_overrides_on_weak() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            // Weak defaults this flag to true; the override clears it.
            ignore_string_values: Some(false),
            ignore_numeric_values: Some(true),
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Weak, &overrides);
        assert!(resolved.ignore_identifiers);
        // Overridden to false.
        assert!(!resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    // ── DuplicatesConfig: JSON deserialization ─────────────────────

    // Explicit camelCase keys in JSON are all honoured.
    #[test]
    fn duplicates_config_json_all_fields() {
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "minOccurrences": 3,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "ignoreDefaults": false,
            "skipLocal": true,
            "crossLanguage": true,
            "ignoreImports": true
        }"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Semantic);
        assert_eq!(config.min_tokens, 100);
        assert_eq!(config.min_lines, 10);
        assert_eq!(config.min_occurrences, 3);
        assert!((config.threshold - 5.0).abs() < f64::EPSILON);
        assert_eq!(config.ignore, vec!["**/vendor/**"]);
        assert!(!config.ignore_defaults);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(config.ignore_imports);
    }

    // Keys missing from the JSON fall back to their serde defaults.
    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let json = r#"{"mode": "weak"}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        // Default.
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Weak);
        // Default.
        assert_eq!(config.min_tokens, 50);
        // Default.
        assert_eq!(config.min_lines, 5);
        // Default.
        assert!(config.ignore_defaults);
    }

    // Supplying `ignore` alone leaves `ignoreDefaults` at its default of true.
    #[test]
    fn duplicates_config_json_ignore_defaults_merges_by_default() {
        let json = r#"{"ignore": ["**/foo/**"]}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore, vec!["**/foo/**"]);
        assert!(config.ignore_defaults);
    }

    // Present keys become `Some(_)`; absent keys stay `None`.
    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let config: NormalizationConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore_identifiers, Some(true));
        assert_eq!(config.ignore_string_values, Some(false));
        assert_eq!(config.ignore_numeric_values, None);
    }

    // ── DuplicatesConfig: TOML deserialization ─────────────────────

    // The same camelCase keys work in TOML, including the nested table.
    #[test]
    fn duplicates_config_toml_all_fields() {
        let toml_str = r#"
enabled = false
mode = "weak"
minTokens = 75
minLines = 8
minOccurrences = 3
threshold = 3.0
ignore = ["vendor/**"]
skipLocal = true
crossLanguage = true
ignoreImports = true

[normalization]
ignoreIdentifiers = true
ignoreStringValues = true
ignoreNumericValues = false
"#;
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_eq!(config.min_tokens, 75);
        assert_eq!(config.min_lines, 8);
        assert_eq!(config.min_occurrences, 3);
        assert!((config.threshold - 3.0).abs() < f64::EPSILON);
        assert_eq!(config.ignore, vec!["vendor/**"]);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(config.ignore_imports);
        assert_eq!(config.normalization.ignore_identifiers, Some(true));
        assert_eq!(config.normalization.ignore_string_values, Some(true));
        assert_eq!(config.normalization.ignore_numeric_values, Some(false));
    }

    // An empty TOML document deserializes to the defaults.
    #[test]
    fn duplicates_config_toml_defaults() {
        let toml_str = "";
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
    }

    // ── NormalizationConfig defaults ───────────────────────────────

    // `Default` leaves every override unset.
    #[test]
    fn normalization_config_default_all_none() {
        let config = NormalizationConfig::default();
        assert!(config.ignore_identifiers.is_none());
        assert!(config.ignore_string_values.is_none());
        assert!(config.ignore_numeric_values.is_none());
    }

    // An empty JSON object also leaves every override unset.
    #[test]
    fn normalization_config_empty_json_object() {
        let config: NormalizationConfig = serde_json::from_str("{}").unwrap();
        assert!(config.ignore_identifiers.is_none());
        assert!(config.ignore_string_values.is_none());
        assert!(config.ignore_numeric_values.is_none());
    }

    // ── DetectionMode default ──────────────────────────────────────

    // The `#[default]` variant is `Mild`.
    #[test]
    fn detection_mode_default_is_mild() {
        assert_eq!(DetectionMode::default(), DetectionMode::Mild);
    }

    // ── ResolvedNormalization equality ─────────────────────────────

    // Derived `PartialEq` compares all three flags.
    #[test]
    fn resolved_normalization_equality() {
        let a = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let b = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_eq!(a, b);

        let c = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_ne!(a, c);
    }

    // ── DetectionMode: serde ───────────────────────────────────────

    // Each variant deserializes from its lowercase JSON string.
    #[test]
    fn detection_mode_json_deserialization() {
        let strict: DetectionMode = serde_json::from_str(r#""strict""#).unwrap();
        assert_eq!(strict, DetectionMode::Strict);

        let mild: DetectionMode = serde_json::from_str(r#""mild""#).unwrap();
        assert_eq!(mild, DetectionMode::Mild);

        let weak: DetectionMode = serde_json::from_str(r#""weak""#).unwrap();
        assert_eq!(weak, DetectionMode::Weak);

        let semantic: DetectionMode = serde_json::from_str(r#""semantic""#).unwrap();
        assert_eq!(semantic, DetectionMode::Semantic);
    }

    // An unknown variant name is a deserialization error.
    #[test]
    fn detection_mode_invalid_json() {
        let result: Result<DetectionMode, _> = serde_json::from_str(r#""aggressive""#);
        assert!(result.is_err());
    }

    // ── DuplicatesConfig: round-trip ───────────────────────────────

    // Serializing to JSON and reading it back preserves the checked fields.
    #[test]
    fn duplicates_config_json_roundtrip() {
        let config = DuplicatesConfig {
            enabled: false,
            mode: DetectionMode::Semantic,
            min_tokens: 100,
            min_lines: 10,
            min_occurrences: 4,
            threshold: 5.5,
            ignore: vec!["test/**".to_string()],
            ignore_defaults: false,
            skip_local: true,
            cross_language: true,
            ignore_imports: true,
            normalization: NormalizationConfig {
                ignore_identifiers: Some(true),
                ignore_string_values: None,
                ignore_numeric_values: Some(false),
            },
            min_corpus_size_for_shingle_filter: 2048,
            min_corpus_size_for_token_cache: 8_000,
        };
        let json = serde_json::to_string(&config).unwrap();
        let restored: DuplicatesConfig = serde_json::from_str(&json).unwrap();
        assert!(!restored.enabled);
        assert_eq!(restored.mode, DetectionMode::Semantic);
        assert_eq!(restored.min_tokens, 100);
        assert_eq!(restored.min_lines, 10);
        assert_eq!(restored.min_occurrences, 4);
        assert!((restored.threshold - 5.5).abs() < f64::EPSILON);
        assert!(!restored.ignore_defaults);
        assert!(restored.skip_local);
        assert!(restored.cross_language);
        assert_eq!(restored.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(restored.min_corpus_size_for_token_cache, 8_000);
        assert!(restored.ignore_imports);
        assert_eq!(restored.normalization.ignore_identifiers, Some(true));
        assert!(restored.normalization.ignore_string_values.is_none());
        assert_eq!(restored.normalization.ignore_numeric_values, Some(false));
    }

    // ── NormalizationConfig: skip_serializing_if ───────────────────

    // `None` overrides are omitted from the serialized JSON.
    #[test]
    fn normalization_none_fields_not_serialized() {
        let config = NormalizationConfig::default();
        let json = serde_json::to_string(&config).unwrap();
        assert!(
            !json.contains("ignoreIdentifiers"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreStringValues"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreNumericValues"),
            "None fields should be skipped"
        );
    }

    // `Some(_)` overrides are serialized; the `None` one is still omitted.
    #[test]
    fn normalization_some_fields_serialized() {
        let config = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: Some(false),
        };
        let json = serde_json::to_string(&config).unwrap();
        assert!(json.contains("ignoreIdentifiers"));
        assert!(!json.contains("ignoreStringValues"));
        assert!(json.contains("ignoreNumericValues"));
    }

    // ── minOccurrences validation ──────────────────────────────────

    // The lower bound (2) and larger values are accepted.
    #[test]
    fn min_occurrences_accepts_two_or_more() {
        let json = r#"{"minOccurrences": 2}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.min_occurrences, 2);

        let json = r#"{"minOccurrences": 5}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.min_occurrences, 5);
    }

    // A value of 1 is rejected with the "at least 2" message.
    #[test]
    fn min_occurrences_rejects_one() {
        let json = r#"{"minOccurrences": 1}"#;
        let err = serde_json::from_str::<DuplicatesConfig>(json).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }

    // A value of 0 is rejected with the "at least 2" message.
    #[test]
    fn min_occurrences_rejects_zero() {
        let json = r#"{"minOccurrences": 0}"#;
        let err = serde_json::from_str::<DuplicatesConfig>(json).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }

    // The same validation applies when the input is TOML.
    #[test]
    fn min_occurrences_rejects_one_in_toml() {
        let toml_str = "minOccurrences = 1";
        let err = toml::from_str::<DuplicatesConfig>(toml_str).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }
}