1use schemars::JsonSchema;
2use serde::{Deserialize, Deserializer, Serialize};
3
/// Serde default provider for boolean options that are on unless the user
/// turns them off.
///
/// Always yields `true`.
const fn default_true() -> bool {
    true
}
7
/// Serde default provider for `DuplicatesConfig::min_tokens`.
///
/// Always yields `50`.
const fn default_min_tokens() -> usize {
    50
}
11
/// Serde default provider for `DuplicatesConfig::min_lines`.
///
/// Always yields `5`.
const fn default_min_lines() -> usize {
    5
}
15
/// Serde default provider for `DuplicatesConfig::min_occurrences`.
///
/// Always yields `2`, which is also the smallest value the field's custom
/// deserializer accepts.
const fn default_min_occurrences() -> usize {
    2
}
19
20fn deserialize_min_occurrences<'de, D>(deserializer: D) -> Result<usize, D::Error>
24where
25 D: Deserializer<'de>,
26{
27 let value = usize::deserialize(deserializer)?;
28 if value < 2 {
29 return Err(serde::de::Error::custom(format!(
30 "minOccurrences must be at least 2 (got {value}); a single occurrence isn't a duplicate"
31 )));
32 }
33 Ok(value)
34}
35
/// Serde default provider for
/// `DuplicatesConfig::min_corpus_size_for_shingle_filter`.
///
/// Always yields `1024`.
const fn default_min_corpus_size_for_shingle_filter() -> usize {
    1024
}
39
/// Serde default provider for
/// `DuplicatesConfig::min_corpus_size_for_token_cache`.
///
/// Always yields `5_000`.
const fn default_min_corpus_size_for_token_cache() -> usize {
    5_000
}
43
44#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
46#[serde(rename_all = "camelCase")]
47pub struct DuplicatesConfig {
48 #[serde(default = "default_true")]
50 pub enabled: bool,
51
52 #[serde(default)]
54 pub mode: DetectionMode,
55
56 #[serde(default = "default_min_tokens")]
58 pub min_tokens: usize,
59
60 #[serde(default = "default_min_lines")]
62 pub min_lines: usize,
63
64 #[serde(
69 default = "default_min_occurrences",
70 deserialize_with = "deserialize_min_occurrences"
71 )]
72 #[schemars(range(min = 2))]
73 pub min_occurrences: usize,
74
75 #[serde(default)]
77 pub threshold: f64,
78
79 #[serde(default)]
81 pub ignore: Vec<String>,
82
83 #[serde(default = "default_true")]
87 pub ignore_defaults: bool,
88
89 #[serde(default)]
91 pub skip_local: bool,
92
93 #[serde(default)]
99 pub cross_language: bool,
100
101 #[serde(default = "default_true")]
111 pub ignore_imports: bool,
112
113 #[serde(default)]
115 pub normalization: NormalizationConfig,
116
117 #[serde(default = "default_min_corpus_size_for_shingle_filter")]
120 pub min_corpus_size_for_shingle_filter: usize,
121
122 #[serde(default = "default_min_corpus_size_for_token_cache")]
127 pub min_corpus_size_for_token_cache: usize,
128}
129
130impl Default for DuplicatesConfig {
131 fn default() -> Self {
132 Self {
133 enabled: true,
134 mode: DetectionMode::default(),
135 min_tokens: default_min_tokens(),
136 min_lines: default_min_lines(),
137 min_occurrences: default_min_occurrences(),
138 threshold: 0.0,
139 ignore: vec![],
140 ignore_defaults: true,
141 skip_local: false,
142 cross_language: false,
143 ignore_imports: true,
144 normalization: NormalizationConfig::default(),
145 min_corpus_size_for_shingle_filter: default_min_corpus_size_for_shingle_filter(),
146 min_corpus_size_for_token_cache: default_min_corpus_size_for_token_cache(),
147 }
148 }
149}
150
151#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
157#[serde(rename_all = "camelCase")]
158pub struct NormalizationConfig {
159 #[serde(default, skip_serializing_if = "Option::is_none")]
162 pub ignore_identifiers: Option<bool>,
163
164 #[serde(default, skip_serializing_if = "Option::is_none")]
167 pub ignore_string_values: Option<bool>,
168
169 #[serde(default, skip_serializing_if = "Option::is_none")]
172 pub ignore_numeric_values: Option<bool>,
173}
174
/// Fully resolved normalization flags: one concrete `bool` per flag, with no
/// "unset" state left.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ResolvedNormalization {
    /// Final value of the "ignore identifiers" flag.
    pub ignore_identifiers: bool,
    /// Final value of the "ignore string values" flag.
    pub ignore_string_values: bool,
    /// Final value of the "ignore numeric values" flag.
    pub ignore_numeric_values: bool,
}
182
183impl ResolvedNormalization {
184 #[must_use]
186 pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
187 let (default_ids, default_strings, default_numbers) = match mode {
188 DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
189 DetectionMode::Weak => (false, true, false),
190 DetectionMode::Semantic => (true, true, true),
191 };
192
193 Self {
194 ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
195 ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
196 ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
197 }
198 }
199}
200
201#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
209#[serde(rename_all = "lowercase")]
210pub enum DetectionMode {
211 Strict,
213 #[default]
215 Mild,
216 Weak,
218 Semantic,
220}
221
222impl std::fmt::Display for DetectionMode {
223 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
224 match self {
225 Self::Strict => write!(f, "strict"),
226 Self::Mild => write!(f, "mild"),
227 Self::Weak => write!(f, "weak"),
228 Self::Semantic => write!(f, "semantic"),
229 }
230 }
231}
232
233impl std::str::FromStr for DetectionMode {
234 type Err = String;
235
236 fn from_str(s: &str) -> Result<Self, Self::Err> {
237 match s.to_lowercase().as_str() {
238 "strict" => Ok(Self::Strict),
239 "mild" => Ok(Self::Mild),
240 "weak" => Ok(Self::Weak),
241 "semantic" => Ok(Self::Semantic),
242 other => Err(format!("unknown detection mode: '{other}'")),
243 }
244 }
245}
246
// Unit tests for the duplicates configuration: defaults, mode parsing and
// display, normalization resolution, and JSON/TOML (de)serialization.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn duplicates_config_defaults() {
        let config = DuplicatesConfig::default();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert_eq!(config.min_occurrences, 2);
        assert!((config.threshold - 0.0).abs() < f64::EPSILON);
        assert!(config.ignore.is_empty());
        assert!(config.ignore_defaults);
        assert!(!config.skip_local);
        assert!(!config.cross_language);
        assert!(config.ignore_imports);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(config.min_corpus_size_for_token_cache, 5_000);
    }

    #[test]
    fn detection_mode_from_str_all_variants() {
        assert_eq!(
            "strict".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "mild".parse::<DetectionMode>().unwrap(),
            DetectionMode::Mild
        );
        assert_eq!(
            "weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "semantic".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        assert_eq!(
            "STRICT".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "Weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "SEMANTIC".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let err = "foobar".parse::<DetectionMode>().unwrap_err();
        assert!(err.contains("unknown detection mode"));
        assert!(err.contains("foobar"));
    }

    #[test]
    fn detection_mode_display() {
        assert_eq!(DetectionMode::Strict.to_string(), "strict");
        assert_eq!(DetectionMode::Mild.to_string(), "mild");
        assert_eq!(DetectionMode::Weak.to_string(), "weak");
        assert_eq!(DetectionMode::Semantic.to_string(), "semantic");
    }

    #[test]
    fn resolve_strict_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Mild, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        let resolved = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_true() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_false() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides);
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        // Not overridden, so the Semantic baseline (true) is kept.
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: Some(false),
            ignore_numeric_values: Some(true),
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Weak, &overrides);
        assert!(resolved.ignore_identifiers);
        // The override flips the Weak baseline (true) off.
        assert!(!resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn duplicates_config_json_all_fields() {
        // Every key is present and set to a non-default value, so a key that
        // was silently ignored would be caught by the assertions below.
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "minOccurrences": 3,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "ignoreDefaults": false,
            "skipLocal": true,
            "crossLanguage": true,
            "ignoreImports": false,
            "normalization": { "ignoreNumericValues": true },
            "minCorpusSizeForShingleFilter": 2048,
            "minCorpusSizeForTokenCache": 8000
        }"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Semantic);
        assert_eq!(config.min_tokens, 100);
        assert_eq!(config.min_lines, 10);
        assert_eq!(config.min_occurrences, 3);
        assert!((config.threshold - 5.0).abs() < f64::EPSILON);
        assert_eq!(config.ignore, vec!["**/vendor/**"]);
        assert!(!config.ignore_defaults);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(!config.ignore_imports);
        assert_eq!(config.normalization.ignore_identifiers, None);
        assert_eq!(config.normalization.ignore_string_values, None);
        assert_eq!(config.normalization.ignore_numeric_values, Some(true));
        assert_eq!(config.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(config.min_corpus_size_for_token_cache, 8_000);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let json = r#"{"mode": "weak"}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        // The custom deserializer must not get in the way of the default.
        assert_eq!(config.min_occurrences, 2);
        assert!(config.ignore_defaults);
    }

    #[test]
    fn duplicates_config_json_ignore_defaults_merges_by_default() {
        let json = r#"{"ignore": ["**/foo/**"]}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore, vec!["**/foo/**"]);
        assert!(config.ignore_defaults);
    }

    #[test]
    fn ignore_imports_defaults_true_when_field_omitted() {
        let empty: DuplicatesConfig = serde_json::from_str("{}").unwrap();
        assert!(empty.ignore_imports);
        let partial: DuplicatesConfig = serde_json::from_str(r#"{"minLines": 8}"#).unwrap();
        assert!(partial.ignore_imports);
    }

    #[test]
    fn ignore_imports_false_opts_out() {
        let json: DuplicatesConfig = serde_json::from_str(r#"{"ignoreImports": false}"#).unwrap();
        assert!(!json.ignore_imports);
        let toml_cfg: DuplicatesConfig = toml::from_str("ignoreImports = false").unwrap();
        assert!(!toml_cfg.ignore_imports);
    }

    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let config: NormalizationConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore_identifiers, Some(true));
        assert_eq!(config.ignore_string_values, Some(false));
        assert_eq!(config.ignore_numeric_values, None);
    }

    #[test]
    fn duplicates_config_toml_all_fields() {
        // Every key is present and set to a non-default value, so a key that
        // was silently ignored would be caught by the assertions below.
        let toml_str = r#"
enabled = false
mode = "weak"
minTokens = 75
minLines = 8
minOccurrences = 3
threshold = 3.0
ignore = ["vendor/**"]
ignoreDefaults = false
skipLocal = true
crossLanguage = true
ignoreImports = false
minCorpusSizeForShingleFilter = 2048
minCorpusSizeForTokenCache = 8000

[normalization]
ignoreIdentifiers = true
ignoreStringValues = true
ignoreNumericValues = false
"#;
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_eq!(config.min_tokens, 75);
        assert_eq!(config.min_lines, 8);
        assert_eq!(config.min_occurrences, 3);
        assert!((config.threshold - 3.0).abs() < f64::EPSILON);
        assert_eq!(config.ignore, vec!["vendor/**"]);
        assert!(!config.ignore_defaults);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(!config.ignore_imports);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(config.min_corpus_size_for_token_cache, 8_000);
        assert_eq!(config.normalization.ignore_identifiers, Some(true));
        assert_eq!(config.normalization.ignore_string_values, Some(true));
        assert_eq!(config.normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn duplicates_config_toml_defaults() {
        let toml_str = "";
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert_eq!(config.min_occurrences, 2);
    }

    #[test]
    fn normalization_config_default_all_none() {
        let config = NormalizationConfig::default();
        assert!(config.ignore_identifiers.is_none());
        assert!(config.ignore_string_values.is_none());
        assert!(config.ignore_numeric_values.is_none());
    }

    #[test]
    fn normalization_config_empty_json_object() {
        let config: NormalizationConfig = serde_json::from_str("{}").unwrap();
        assert!(config.ignore_identifiers.is_none());
        assert!(config.ignore_string_values.is_none());
        assert!(config.ignore_numeric_values.is_none());
    }

    #[test]
    fn detection_mode_default_is_mild() {
        assert_eq!(DetectionMode::default(), DetectionMode::Mild);
    }

    #[test]
    fn resolved_normalization_equality() {
        let a = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let b = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_eq!(a, b);

        let c = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_ne!(a, c);
    }

    #[test]
    fn detection_mode_json_deserialization() {
        let strict: DetectionMode = serde_json::from_str(r#""strict""#).unwrap();
        assert_eq!(strict, DetectionMode::Strict);

        let mild: DetectionMode = serde_json::from_str(r#""mild""#).unwrap();
        assert_eq!(mild, DetectionMode::Mild);

        let weak: DetectionMode = serde_json::from_str(r#""weak""#).unwrap();
        assert_eq!(weak, DetectionMode::Weak);

        let semantic: DetectionMode = serde_json::from_str(r#""semantic""#).unwrap();
        assert_eq!(semantic, DetectionMode::Semantic);
    }

    #[test]
    fn detection_mode_invalid_json() {
        let result: Result<DetectionMode, _> = serde_json::from_str(r#""aggressive""#);
        assert!(result.is_err());
    }

    #[test]
    fn duplicates_config_json_roundtrip() {
        // Every field differs from its default, so a field dropped during
        // serialization cannot be masked by the default on the way back in.
        let config = DuplicatesConfig {
            enabled: false,
            mode: DetectionMode::Semantic,
            min_tokens: 100,
            min_lines: 10,
            min_occurrences: 4,
            threshold: 5.5,
            ignore: vec!["test/**".to_string()],
            ignore_defaults: false,
            skip_local: true,
            cross_language: true,
            ignore_imports: false,
            normalization: NormalizationConfig {
                ignore_identifiers: Some(true),
                ignore_string_values: None,
                ignore_numeric_values: Some(false),
            },
            min_corpus_size_for_shingle_filter: 2048,
            min_corpus_size_for_token_cache: 8_000,
        };
        let json = serde_json::to_string(&config).unwrap();
        let restored: DuplicatesConfig = serde_json::from_str(&json).unwrap();
        assert!(!restored.enabled);
        assert_eq!(restored.mode, DetectionMode::Semantic);
        assert_eq!(restored.min_tokens, 100);
        assert_eq!(restored.min_lines, 10);
        assert_eq!(restored.min_occurrences, 4);
        assert!((restored.threshold - 5.5).abs() < f64::EPSILON);
        assert_eq!(restored.ignore, vec!["test/**"]);
        assert!(!restored.ignore_defaults);
        assert!(restored.skip_local);
        assert!(restored.cross_language);
        assert_eq!(restored.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(restored.min_corpus_size_for_token_cache, 8_000);
        assert!(!restored.ignore_imports);
        assert_eq!(restored.normalization.ignore_identifiers, Some(true));
        assert!(restored.normalization.ignore_string_values.is_none());
        assert_eq!(restored.normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn normalization_none_fields_not_serialized() {
        let config = NormalizationConfig::default();
        let json = serde_json::to_string(&config).unwrap();
        assert!(
            !json.contains("ignoreIdentifiers"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreStringValues"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreNumericValues"),
            "None fields should be skipped"
        );
    }

    #[test]
    fn normalization_some_fields_serialized() {
        let config = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: Some(false),
        };
        let json = serde_json::to_string(&config).unwrap();
        assert!(json.contains("ignoreIdentifiers"));
        assert!(!json.contains("ignoreStringValues"));
        assert!(json.contains("ignoreNumericValues"));
    }

    #[test]
    fn min_occurrences_accepts_two_or_more() {
        let json = r#"{"minOccurrences": 2}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.min_occurrences, 2);

        let json = r#"{"minOccurrences": 5}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.min_occurrences, 5);
    }

    #[test]
    fn min_occurrences_rejects_one() {
        let json = r#"{"minOccurrences": 1}"#;
        let err = serde_json::from_str::<DuplicatesConfig>(json).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }

    #[test]
    fn min_occurrences_rejects_zero() {
        let json = r#"{"minOccurrences": 0}"#;
        let err = serde_json::from_str::<DuplicatesConfig>(json).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }

    #[test]
    fn min_occurrences_rejects_one_in_toml() {
        let toml_str = "minOccurrences = 1";
        let err = toml::from_str::<DuplicatesConfig>(toml_str).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }
}