1use schemars::JsonSchema;
2use serde::{Deserialize, Deserializer, Serialize};
3
/// Serde `default = "..."` helper for boolean fields that are on unless the
/// input explicitly turns them off.
const fn default_true() -> bool {
    true
}
7
/// Serde default for `DuplicatesConfig::min_tokens`.
const fn default_min_tokens() -> usize {
    const MIN_TOKENS: usize = 50;
    MIN_TOKENS
}
11
/// Serde default for `DuplicatesConfig::min_lines`.
const fn default_min_lines() -> usize {
    const MIN_LINES: usize = 5;
    MIN_LINES
}
15
/// Serde default for `DuplicatesConfig::min_occurrences`.
///
/// This is also the smallest value `deserialize_min_occurrences` accepts.
const fn default_min_occurrences() -> usize {
    const MIN_OCCURRENCES: usize = 2;
    MIN_OCCURRENCES
}
19
20fn deserialize_min_occurrences<'de, D>(deserializer: D) -> Result<usize, D::Error>
24where
25 D: Deserializer<'de>,
26{
27 let value = usize::deserialize(deserializer)?;
28 if value < 2 {
29 return Err(serde::de::Error::custom(format!(
30 "minOccurrences must be at least 2 (got {value}); a single occurrence isn't a duplicate"
31 )));
32 }
33 Ok(value)
34}
35
/// Serde default for `DuplicatesConfig::min_corpus_size_for_shingle_filter`.
const fn default_min_corpus_size_for_shingle_filter() -> usize {
    const MIN_CORPUS_SIZE: usize = 1_024;
    MIN_CORPUS_SIZE
}
39
/// Serde default for `DuplicatesConfig::min_corpus_size_for_token_cache`.
const fn default_min_corpus_size_for_token_cache() -> usize {
    const MIN_CORPUS_SIZE: usize = 5_000;
    MIN_CORPUS_SIZE
}
43
44#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
46#[serde(rename_all = "camelCase")]
47pub struct DuplicatesConfig {
48 #[serde(default = "default_true")]
50 pub enabled: bool,
51
52 #[serde(default)]
54 pub mode: DetectionMode,
55
56 #[serde(default = "default_min_tokens")]
58 pub min_tokens: usize,
59
60 #[serde(default = "default_min_lines")]
62 pub min_lines: usize,
63
64 #[serde(
69 default = "default_min_occurrences",
70 deserialize_with = "deserialize_min_occurrences"
71 )]
72 #[schemars(range(min = 2))]
73 pub min_occurrences: usize,
74
75 #[serde(default)]
77 pub threshold: f64,
78
79 #[serde(default)]
81 pub ignore: Vec<String>,
82
83 #[serde(default = "default_true")]
87 pub ignore_defaults: bool,
88
89 #[serde(default)]
91 pub skip_local: bool,
92
93 #[serde(default)]
99 pub cross_language: bool,
100
101 #[serde(default = "default_true")]
112 pub ignore_imports: bool,
113
114 #[serde(default)]
116 pub normalization: NormalizationConfig,
117
118 #[serde(default = "default_min_corpus_size_for_shingle_filter")]
121 pub min_corpus_size_for_shingle_filter: usize,
122
123 #[serde(default = "default_min_corpus_size_for_token_cache")]
128 pub min_corpus_size_for_token_cache: usize,
129}
130
131impl Default for DuplicatesConfig {
132 fn default() -> Self {
133 Self {
134 enabled: true,
135 mode: DetectionMode::default(),
136 min_tokens: default_min_tokens(),
137 min_lines: default_min_lines(),
138 min_occurrences: default_min_occurrences(),
139 threshold: 0.0,
140 ignore: vec![],
141 ignore_defaults: true,
142 skip_local: false,
143 cross_language: false,
144 ignore_imports: true,
145 normalization: NormalizationConfig::default(),
146 min_corpus_size_for_shingle_filter: default_min_corpus_size_for_shingle_filter(),
147 min_corpus_size_for_token_cache: default_min_corpus_size_for_token_cache(),
148 }
149 }
150}
151
152#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
158#[serde(rename_all = "camelCase")]
159pub struct NormalizationConfig {
160 #[serde(default, skip_serializing_if = "Option::is_none")]
163 pub ignore_identifiers: Option<bool>,
164
165 #[serde(default, skip_serializing_if = "Option::is_none")]
168 pub ignore_string_values: Option<bool>,
169
170 #[serde(default, skip_serializing_if = "Option::is_none")]
173 pub ignore_numeric_values: Option<bool>,
174}
175
/// Normalization flags after resolution: every field is a concrete `bool`
/// rather than an optional override.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ResolvedNormalization {
    /// Resolved "ignore identifiers" flag.
    pub ignore_identifiers: bool,
    /// Resolved "ignore string values" flag.
    pub ignore_string_values: bool,
    /// Resolved "ignore numeric values" flag.
    pub ignore_numeric_values: bool,
}
183
184impl ResolvedNormalization {
185 #[must_use]
187 pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
188 let (default_ids, default_strings, default_numbers) = match mode {
189 DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
190 DetectionMode::Weak => (false, true, false),
191 DetectionMode::Semantic => (true, true, true),
192 };
193
194 Self {
195 ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
196 ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
197 ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
198 }
199 }
200}
201
202#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
210#[serde(rename_all = "lowercase")]
211pub enum DetectionMode {
212 Strict,
214 #[default]
216 Mild,
217 Weak,
219 Semantic,
221}
222
223impl std::fmt::Display for DetectionMode {
224 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
225 match self {
226 Self::Strict => write!(f, "strict"),
227 Self::Mild => write!(f, "mild"),
228 Self::Weak => write!(f, "weak"),
229 Self::Semantic => write!(f, "semantic"),
230 }
231 }
232}
233
234impl std::str::FromStr for DetectionMode {
235 type Err = String;
236
237 fn from_str(s: &str) -> Result<Self, Self::Err> {
238 match s.to_lowercase().as_str() {
239 "strict" => Ok(Self::Strict),
240 "mild" => Ok(Self::Mild),
241 "weak" => Ok(Self::Weak),
242 "semantic" => Ok(Self::Semantic),
243 other => Err(format!("unknown detection mode: '{other}'")),
244 }
245 }
246}
247
#[cfg(test)]
mod tests {
    use super::*;

    /// `true` when the two floats differ by less than `f64::EPSILON`.
    fn approx_eq(lhs: f64, rhs: f64) -> bool {
        (lhs - rhs).abs() < f64::EPSILON
    }

    /// Resolves `mode` with an all-`None` override set.
    fn resolve_defaults(mode: DetectionMode) -> ResolvedNormalization {
        ResolvedNormalization::resolve(mode, &NormalizationConfig::default())
    }

    /// Builds an expected resolution from (identifiers, strings, numbers).
    fn flags(ids: bool, strings: bool, numbers: bool) -> ResolvedNormalization {
        ResolvedNormalization {
            ignore_identifiers: ids,
            ignore_string_values: strings,
            ignore_numeric_values: numbers,
        }
    }

    #[test]
    fn duplicates_config_defaults() {
        let defaults = DuplicatesConfig::default();

        assert!(defaults.enabled);
        assert_eq!(defaults.mode, DetectionMode::Mild);
        assert_eq!(defaults.min_tokens, 50);
        assert_eq!(defaults.min_lines, 5);
        assert_eq!(defaults.min_occurrences, 2);
        assert!(approx_eq(defaults.threshold, 0.0));
        assert!(defaults.ignore.is_empty());
        assert!(defaults.ignore_defaults);
        assert!(!defaults.skip_local);
        assert!(!defaults.cross_language);
        assert!(defaults.ignore_imports);
        assert_eq!(defaults.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(defaults.min_corpus_size_for_token_cache, 5_000);
    }

    #[test]
    fn detection_mode_from_str_all_variants() {
        let cases = [
            ("strict", DetectionMode::Strict),
            ("mild", DetectionMode::Mild),
            ("weak", DetectionMode::Weak),
            ("semantic", DetectionMode::Semantic),
        ];
        for (text, expected) in cases {
            assert_eq!(text.parse::<DetectionMode>().unwrap(), expected);
        }
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        let cases = [
            ("STRICT", DetectionMode::Strict),
            ("Weak", DetectionMode::Weak),
            ("SEMANTIC", DetectionMode::Semantic),
        ];
        for (text, expected) in cases {
            assert_eq!(text.parse::<DetectionMode>().unwrap(), expected);
        }
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let message = "foobar".parse::<DetectionMode>().unwrap_err();
        for fragment in ["unknown detection mode", "foobar"] {
            assert!(message.contains(fragment));
        }
    }

    #[test]
    fn detection_mode_display() {
        let cases = [
            (DetectionMode::Strict, "strict"),
            (DetectionMode::Mild, "mild"),
            (DetectionMode::Weak, "weak"),
            (DetectionMode::Semantic, "semantic"),
        ];
        for (mode, expected) in cases {
            assert_eq!(mode.to_string(), expected);
        }
    }

    #[test]
    fn resolve_strict_mode_all_false() {
        assert_eq!(
            resolve_defaults(DetectionMode::Strict),
            flags(false, false, false)
        );
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        assert_eq!(
            resolve_defaults(DetectionMode::Mild),
            flags(false, false, false)
        );
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        assert_eq!(
            resolve_defaults(DetectionMode::Weak),
            flags(false, true, false)
        );
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        assert_eq!(
            resolve_defaults(DetectionMode::Semantic),
            flags(true, true, true)
        );
    }

    #[test]
    fn resolve_override_forces_true() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ..NormalizationConfig::default()
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert_eq!(resolved, flags(true, false, false));
    }

    #[test]
    fn resolve_override_forces_false() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides);
        // The numeric flag has no override, so the semantic default stays on.
        assert_eq!(resolved, flags(false, false, true));
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: Some(false),
            ignore_numeric_values: Some(true),
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Weak, &overrides);
        // All three flags come from the overrides, including the string flag
        // that weak mode would otherwise turn on.
        assert_eq!(resolved, flags(true, false, true));
    }

    #[test]
    fn duplicates_config_json_all_fields() {
        let input = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "minOccurrences": 3,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "ignoreDefaults": false,
            "skipLocal": true,
            "crossLanguage": true,
            "ignoreImports": true
        }"#;
        let parsed = serde_json::from_str::<DuplicatesConfig>(input).unwrap();

        assert!(!parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Semantic);
        assert_eq!(parsed.min_tokens, 100);
        assert_eq!(parsed.min_lines, 10);
        assert_eq!(parsed.min_occurrences, 3);
        assert!(approx_eq(parsed.threshold, 5.0));
        assert_eq!(parsed.ignore, ["**/vendor/**"]);
        assert!(!parsed.ignore_defaults);
        assert!(parsed.skip_local);
        assert!(parsed.cross_language);
        assert!(parsed.ignore_imports);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let parsed = serde_json::from_str::<DuplicatesConfig>(r#"{"mode": "weak"}"#).unwrap();

        // Only `mode` was supplied; everything else comes from the defaults.
        assert!(parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Weak);
        assert_eq!(parsed.min_tokens, 50);
        assert_eq!(parsed.min_lines, 5);
        assert!(parsed.ignore_defaults);
    }

    #[test]
    fn duplicates_config_json_ignore_defaults_merges_by_default() {
        let parsed =
            serde_json::from_str::<DuplicatesConfig>(r#"{"ignore": ["**/foo/**"]}"#).unwrap();

        assert_eq!(parsed.ignore, ["**/foo/**"]);
        assert!(parsed.ignore_defaults);
    }

    #[test]
    fn ignore_imports_defaults_true_when_field_omitted() {
        for input in ["{}", r#"{"minLines": 8}"#] {
            let parsed = serde_json::from_str::<DuplicatesConfig>(input).unwrap();
            assert!(parsed.ignore_imports);
        }
    }

    #[test]
    fn ignore_imports_false_opts_out() {
        let from_json =
            serde_json::from_str::<DuplicatesConfig>(r#"{"ignoreImports": false}"#).unwrap();
        assert!(!from_json.ignore_imports);

        let from_toml = toml::from_str::<DuplicatesConfig>("ignoreImports = false").unwrap();
        assert!(!from_toml.ignore_imports);
    }

    #[test]
    fn normalization_config_json_overrides() {
        let input = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let parsed = serde_json::from_str::<NormalizationConfig>(input).unwrap();

        assert_eq!(parsed.ignore_identifiers, Some(true));
        assert_eq!(parsed.ignore_string_values, Some(false));
        assert_eq!(parsed.ignore_numeric_values, None);
    }

    #[test]
    fn duplicates_config_toml_all_fields() {
        let input = r#"
enabled = false
mode = "weak"
minTokens = 75
minLines = 8
minOccurrences = 3
threshold = 3.0
ignore = ["vendor/**"]
skipLocal = true
crossLanguage = true
ignoreImports = true

[normalization]
ignoreIdentifiers = true
ignoreStringValues = true
ignoreNumericValues = false
"#;
        let parsed = toml::from_str::<DuplicatesConfig>(input).unwrap();

        assert!(!parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Weak);
        assert_eq!(parsed.min_tokens, 75);
        assert_eq!(parsed.min_lines, 8);
        assert_eq!(parsed.min_occurrences, 3);
        assert!(approx_eq(parsed.threshold, 3.0));
        assert_eq!(parsed.ignore, ["vendor/**"]);
        assert!(parsed.skip_local);
        assert!(parsed.cross_language);
        assert!(parsed.ignore_imports);

        let normalization = &parsed.normalization;
        assert_eq!(normalization.ignore_identifiers, Some(true));
        assert_eq!(normalization.ignore_string_values, Some(true));
        assert_eq!(normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn duplicates_config_toml_defaults() {
        let parsed = toml::from_str::<DuplicatesConfig>("").unwrap();

        assert!(parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Mild);
        assert_eq!(parsed.min_tokens, 50);
        assert_eq!(parsed.min_lines, 5);
    }

    #[test]
    fn normalization_config_default_all_none() {
        let defaults = NormalizationConfig::default();

        assert!(defaults.ignore_identifiers.is_none());
        assert!(defaults.ignore_string_values.is_none());
        assert!(defaults.ignore_numeric_values.is_none());
    }

    #[test]
    fn normalization_config_empty_json_object() {
        let parsed = serde_json::from_str::<NormalizationConfig>("{}").unwrap();

        assert!(parsed.ignore_identifiers.is_none());
        assert!(parsed.ignore_string_values.is_none());
        assert!(parsed.ignore_numeric_values.is_none());
    }

    #[test]
    fn detection_mode_default_is_mild() {
        assert_eq!(DetectionMode::default(), DetectionMode::Mild);
    }

    #[test]
    fn resolved_normalization_equality() {
        let first = flags(true, false, true);
        let same = flags(true, false, true);
        assert_eq!(first, same);

        let different = flags(false, false, true);
        assert_ne!(first, different);
    }

    #[test]
    fn detection_mode_json_deserialization() {
        let cases = [
            (r#""strict""#, DetectionMode::Strict),
            (r#""mild""#, DetectionMode::Mild),
            (r#""weak""#, DetectionMode::Weak),
            (r#""semantic""#, DetectionMode::Semantic),
        ];
        for (raw, expected) in cases {
            let parsed = serde_json::from_str::<DetectionMode>(raw).unwrap();
            assert_eq!(parsed, expected);
        }
    }

    #[test]
    fn detection_mode_invalid_json() {
        assert!(serde_json::from_str::<DetectionMode>(r#""aggressive""#).is_err());
    }

    #[test]
    fn duplicates_config_json_roundtrip() {
        let original = DuplicatesConfig {
            enabled: false,
            mode: DetectionMode::Semantic,
            min_tokens: 100,
            min_lines: 10,
            min_occurrences: 4,
            threshold: 5.5,
            ignore: vec!["test/**".to_string()],
            ignore_defaults: false,
            skip_local: true,
            cross_language: true,
            ignore_imports: true,
            normalization: NormalizationConfig {
                ignore_identifiers: Some(true),
                ignore_string_values: None,
                ignore_numeric_values: Some(false),
            },
            min_corpus_size_for_shingle_filter: 2048,
            min_corpus_size_for_token_cache: 8_000,
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded = serde_json::from_str::<DuplicatesConfig>(&encoded).unwrap();

        assert!(!decoded.enabled);
        assert_eq!(decoded.mode, DetectionMode::Semantic);
        assert_eq!(decoded.min_tokens, 100);
        assert_eq!(decoded.min_lines, 10);
        assert_eq!(decoded.min_occurrences, 4);
        assert!(approx_eq(decoded.threshold, 5.5));
        assert!(!decoded.ignore_defaults);
        assert!(decoded.skip_local);
        assert!(decoded.cross_language);
        assert_eq!(decoded.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(decoded.min_corpus_size_for_token_cache, 8_000);
        assert!(decoded.ignore_imports);
        assert_eq!(decoded.normalization.ignore_identifiers, Some(true));
        assert!(decoded.normalization.ignore_string_values.is_none());
        assert_eq!(decoded.normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn normalization_none_fields_not_serialized() {
        let encoded = serde_json::to_string(&NormalizationConfig::default()).unwrap();

        for key in [
            "ignoreIdentifiers",
            "ignoreStringValues",
            "ignoreNumericValues",
        ] {
            assert!(!encoded.contains(key), "None fields should be skipped");
        }
    }

    #[test]
    fn normalization_some_fields_serialized() {
        let partial = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: Some(false),
        };
        let encoded = serde_json::to_string(&partial).unwrap();

        assert!(encoded.contains("ignoreIdentifiers"));
        assert!(!encoded.contains("ignoreStringValues"));
        assert!(encoded.contains("ignoreNumericValues"));
    }

    #[test]
    fn min_occurrences_accepts_two_or_more() {
        let cases = [
            (r#"{"minOccurrences": 2}"#, 2),
            (r#"{"minOccurrences": 5}"#, 5),
        ];
        for (input, expected) in cases {
            let parsed = serde_json::from_str::<DuplicatesConfig>(input).unwrap();
            assert_eq!(parsed.min_occurrences, expected);
        }
    }

    #[test]
    fn min_occurrences_rejects_one() {
        let failure =
            serde_json::from_str::<DuplicatesConfig>(r#"{"minOccurrences": 1}"#).unwrap_err();
        assert!(failure.to_string().contains("at least 2"));
    }

    #[test]
    fn min_occurrences_rejects_zero() {
        let failure =
            serde_json::from_str::<DuplicatesConfig>(r#"{"minOccurrences": 0}"#).unwrap_err();
        assert!(failure.to_string().contains("at least 2"));
    }

    #[test]
    fn min_occurrences_rejects_one_in_toml() {
        let failure = toml::from_str::<DuplicatesConfig>("minOccurrences = 1").unwrap_err();
        assert!(failure.to_string().contains("at least 2"));
    }
}