1use schemars::JsonSchema;
2use serde::{Deserialize, Deserializer, Serialize};
3
/// Serde default helper for boolean fields that are on unless the configuration says otherwise.
///
/// Referenced by name from `#[serde(default = "default_true")]` attributes.
const fn default_true() -> bool { true }
7
/// Fallback for `DuplicatesConfig::min_tokens` when the key is absent from the configuration.
const fn default_min_tokens() -> usize { 50 }
11
/// Fallback for `DuplicatesConfig::min_lines` when the key is absent from the configuration.
const fn default_min_lines() -> usize { 5 }
15
/// Fallback for `DuplicatesConfig::min_occurrences` when the key is absent.
///
/// Equal to the smallest value `deserialize_min_occurrences` accepts.
const fn default_min_occurrences() -> usize { 2 }
19
20fn deserialize_min_occurrences<'de, D>(deserializer: D) -> Result<usize, D::Error>
24where
25 D: Deserializer<'de>,
26{
27 let value = usize::deserialize(deserializer)?;
28 if value < 2 {
29 return Err(serde::de::Error::custom(format!(
30 "minOccurrences must be at least 2 (got {value}); a single occurrence isn't a duplicate"
31 )));
32 }
33 Ok(value)
34}
35
/// Fallback for `DuplicatesConfig::min_corpus_size_for_shingle_filter` when the key is absent.
const fn default_min_corpus_size_for_shingle_filter() -> usize { 1024 }
39
/// Fallback for `DuplicatesConfig::min_corpus_size_for_token_cache` when the key is absent.
const fn default_min_corpus_size_for_token_cache() -> usize { 5_000 }
43
44#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
46#[serde(rename_all = "camelCase")]
47pub struct DuplicatesConfig {
48 #[serde(default = "default_true")]
50 pub enabled: bool,
51
52 #[serde(default)]
54 pub mode: DetectionMode,
55
56 #[serde(default = "default_min_tokens")]
58 pub min_tokens: usize,
59
60 #[serde(default = "default_min_lines")]
62 pub min_lines: usize,
63
64 #[serde(
69 default = "default_min_occurrences",
70 deserialize_with = "deserialize_min_occurrences"
71 )]
72 #[schemars(range(min = 2))]
73 pub min_occurrences: usize,
74
75 #[serde(default)]
77 pub threshold: f64,
78
79 #[serde(default)]
81 pub ignore: Vec<String>,
82
83 #[serde(default = "default_true")]
87 pub ignore_defaults: bool,
88
89 #[serde(default)]
91 pub skip_local: bool,
92
93 #[serde(default)]
99 pub cross_language: bool,
100
101 #[serde(default)]
109 pub ignore_imports: bool,
110
111 #[serde(default)]
113 pub normalization: NormalizationConfig,
114
115 #[serde(default = "default_min_corpus_size_for_shingle_filter")]
118 pub min_corpus_size_for_shingle_filter: usize,
119
120 #[serde(default = "default_min_corpus_size_for_token_cache")]
125 pub min_corpus_size_for_token_cache: usize,
126}
127
128impl Default for DuplicatesConfig {
129 fn default() -> Self {
130 Self {
131 enabled: true,
132 mode: DetectionMode::default(),
133 min_tokens: default_min_tokens(),
134 min_lines: default_min_lines(),
135 min_occurrences: default_min_occurrences(),
136 threshold: 0.0,
137 ignore: vec![],
138 ignore_defaults: true,
139 skip_local: false,
140 cross_language: false,
141 ignore_imports: false,
142 normalization: NormalizationConfig::default(),
143 min_corpus_size_for_shingle_filter: default_min_corpus_size_for_shingle_filter(),
144 min_corpus_size_for_token_cache: default_min_corpus_size_for_token_cache(),
145 }
146 }
147}
148
149#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
155#[serde(rename_all = "camelCase")]
156pub struct NormalizationConfig {
157 #[serde(default, skip_serializing_if = "Option::is_none")]
160 pub ignore_identifiers: Option<bool>,
161
162 #[serde(default, skip_serializing_if = "Option::is_none")]
165 pub ignore_string_values: Option<bool>,
166
167 #[serde(default, skip_serializing_if = "Option::is_none")]
170 pub ignore_numeric_values: Option<bool>,
171}
172
/// Final normalization flags with every option decided.
///
/// Mirrors the fields of `NormalizationConfig`, but as plain `bool`s rather than optional
/// overrides.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ResolvedNormalization {
    /// Resolved identifier flag.
    pub ignore_identifiers: bool,
    /// Resolved string-value flag.
    pub ignore_string_values: bool,
    /// Resolved numeric-value flag.
    pub ignore_numeric_values: bool,
}
180
181impl ResolvedNormalization {
182 #[must_use]
184 pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
185 let (default_ids, default_strings, default_numbers) = match mode {
186 DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
187 DetectionMode::Weak => (false, true, false),
188 DetectionMode::Semantic => (true, true, true),
189 };
190
191 Self {
192 ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
193 ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
194 ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
195 }
196 }
197}
198
199#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
207#[serde(rename_all = "lowercase")]
208pub enum DetectionMode {
209 Strict,
211 #[default]
213 Mild,
214 Weak,
216 Semantic,
218}
219
220impl std::fmt::Display for DetectionMode {
221 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
222 match self {
223 Self::Strict => write!(f, "strict"),
224 Self::Mild => write!(f, "mild"),
225 Self::Weak => write!(f, "weak"),
226 Self::Semantic => write!(f, "semantic"),
227 }
228 }
229}
230
231impl std::str::FromStr for DetectionMode {
232 type Err = String;
233
234 fn from_str(s: &str) -> Result<Self, Self::Err> {
235 match s.to_lowercase().as_str() {
236 "strict" => Ok(Self::Strict),
237 "mild" => Ok(Self::Mild),
238 "weak" => Ok(Self::Weak),
239 "semantic" => Ok(Self::Semantic),
240 other => Err(format!("unknown detection mode: '{other}'")),
241 }
242 }
243}
244
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance-based comparison for the `threshold` assertions.
    fn approx_eq(left: f64, right: f64) -> bool {
        (left - right).abs() < f64::EPSILON
    }

    /// Collapses resolved flags into a tuple so one assertion covers all three.
    fn flags(resolved: ResolvedNormalization) -> (bool, bool, bool) {
        (
            resolved.ignore_identifiers,
            resolved.ignore_string_values,
            resolved.ignore_numeric_values,
        )
    }

    // ── Defaults ────────────────────────────────────────────────────────────

    #[test]
    fn duplicates_config_defaults() {
        let config = DuplicatesConfig::default();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert_eq!(config.min_occurrences, 2);
        assert!(approx_eq(config.threshold, 0.0));
        assert!(config.ignore.is_empty());
        assert!(config.ignore_defaults);
        assert!(!config.skip_local);
        assert!(!config.cross_language);
        assert!(!config.ignore_imports);
        assert!(config.normalization.ignore_identifiers.is_none());
        assert!(config.normalization.ignore_string_values.is_none());
        assert!(config.normalization.ignore_numeric_values.is_none());
        assert_eq!(config.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(config.min_corpus_size_for_token_cache, 5_000);
    }

    // ── DetectionMode: FromStr / Display ────────────────────────────────────

    #[test]
    fn detection_mode_from_str_all_variants() {
        assert_eq!(
            "strict".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "mild".parse::<DetectionMode>().unwrap(),
            DetectionMode::Mild
        );
        assert_eq!(
            "weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "semantic".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        assert_eq!(
            "STRICT".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "Weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "SEMANTIC".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let err = "foobar".parse::<DetectionMode>().unwrap_err();
        assert!(err.contains("unknown detection mode"));
        assert!(err.contains("foobar"));
    }

    #[test]
    fn detection_mode_display() {
        assert_eq!(DetectionMode::Strict.to_string(), "strict");
        assert_eq!(DetectionMode::Mild.to_string(), "mild");
        assert_eq!(DetectionMode::Weak.to_string(), "weak");
        assert_eq!(DetectionMode::Semantic.to_string(), "semantic");
    }

    #[test]
    fn detection_mode_display_round_trips_through_from_str() {
        for mode in [
            DetectionMode::Strict,
            DetectionMode::Mild,
            DetectionMode::Weak,
            DetectionMode::Semantic,
        ] {
            assert_eq!(mode.to_string().parse::<DetectionMode>().unwrap(), mode);
        }
    }

    // ── ResolvedNormalization::resolve ──────────────────────────────────────

    #[test]
    fn resolve_strict_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Mild, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        let resolved = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_true() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert_eq!(flags(resolved), (true, false, false));
    }

    #[test]
    fn resolve_override_forces_false() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides);
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        // No override given, so the Semantic default still applies.
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            // Weak defaults this flag to true; the override turns it off.
            ignore_string_values: Some(false),
            ignore_numeric_values: Some(true),
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Weak, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    // ── JSON deserialization ────────────────────────────────────────────────

    #[test]
    fn duplicates_config_json_all_fields() {
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "minOccurrences": 3,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "ignoreDefaults": false,
            "skipLocal": true,
            "crossLanguage": true,
            "ignoreImports": true,
            "normalization": {
                "ignoreIdentifiers": true,
                "ignoreStringValues": false,
                "ignoreNumericValues": true
            },
            "minCorpusSizeForShingleFilter": 2048,
            "minCorpusSizeForTokenCache": 8000
        }"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Semantic);
        assert_eq!(config.min_tokens, 100);
        assert_eq!(config.min_lines, 10);
        assert_eq!(config.min_occurrences, 3);
        assert!(approx_eq(config.threshold, 5.0));
        assert_eq!(config.ignore, vec!["**/vendor/**"]);
        assert!(!config.ignore_defaults);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(config.ignore_imports);
        assert_eq!(config.normalization.ignore_identifiers, Some(true));
        assert_eq!(config.normalization.ignore_string_values, Some(false));
        assert_eq!(config.normalization.ignore_numeric_values, Some(true));
        assert_eq!(config.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(config.min_corpus_size_for_token_cache, 8_000);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let json = r#"{"mode": "weak"}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.mode, DetectionMode::Weak);
        // Everything not mentioned falls back to its serde default.
        assert!(config.enabled);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        // The default bypasses `deserialize_min_occurrences`, so pin it explicitly.
        assert_eq!(config.min_occurrences, 2);
        assert!(approx_eq(config.threshold, 0.0));
        assert!(config.ignore.is_empty());
        assert!(config.ignore_defaults);
        assert!(!config.skip_local);
        assert!(!config.cross_language);
        assert!(!config.ignore_imports);
        assert!(config.normalization.ignore_identifiers.is_none());
        assert!(config.normalization.ignore_string_values.is_none());
        assert!(config.normalization.ignore_numeric_values.is_none());
        assert_eq!(config.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(config.min_corpus_size_for_token_cache, 5_000);
    }

    #[test]
    fn duplicates_config_json_ignore_defaults_merges_by_default() {
        let json = r#"{"ignore": ["**/foo/**"]}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore, vec!["**/foo/**"]);
        assert!(config.ignore_defaults);
    }

    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let config: NormalizationConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore_identifiers, Some(true));
        assert_eq!(config.ignore_string_values, Some(false));
        assert_eq!(config.ignore_numeric_values, None);
    }

    // ── TOML deserialization ────────────────────────────────────────────────

    #[test]
    fn duplicates_config_toml_all_fields() {
        let toml_str = r#"
enabled = false
mode = "weak"
minTokens = 75
minLines = 8
minOccurrences = 3
threshold = 3.0
ignore = ["vendor/**"]
ignoreDefaults = false
skipLocal = true
crossLanguage = true
ignoreImports = true
minCorpusSizeForShingleFilter = 2048
minCorpusSizeForTokenCache = 8000

[normalization]
ignoreIdentifiers = true
ignoreStringValues = true
ignoreNumericValues = false
"#;
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_eq!(config.min_tokens, 75);
        assert_eq!(config.min_lines, 8);
        assert_eq!(config.min_occurrences, 3);
        assert!(approx_eq(config.threshold, 3.0));
        assert_eq!(config.ignore, vec!["vendor/**"]);
        assert!(!config.ignore_defaults);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(config.ignore_imports);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(config.min_corpus_size_for_token_cache, 8_000);
        assert_eq!(config.normalization.ignore_identifiers, Some(true));
        assert_eq!(config.normalization.ignore_string_values, Some(true));
        assert_eq!(config.normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn duplicates_config_toml_defaults() {
        let toml_str = "";
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert_eq!(config.min_occurrences, 2);
        assert!(config.ignore_defaults);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(config.min_corpus_size_for_token_cache, 5_000);
    }

    // ── NormalizationConfig / DetectionMode basics ──────────────────────────

    #[test]
    fn normalization_config_default_all_none() {
        let config = NormalizationConfig::default();
        assert!(config.ignore_identifiers.is_none());
        assert!(config.ignore_string_values.is_none());
        assert!(config.ignore_numeric_values.is_none());
    }

    #[test]
    fn normalization_config_empty_json_object() {
        let config: NormalizationConfig = serde_json::from_str("{}").unwrap();
        assert!(config.ignore_identifiers.is_none());
        assert!(config.ignore_string_values.is_none());
        assert!(config.ignore_numeric_values.is_none());
    }

    #[test]
    fn detection_mode_default_is_mild() {
        assert_eq!(DetectionMode::default(), DetectionMode::Mild);
    }

    #[test]
    fn resolved_normalization_equality() {
        let a = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let b = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_eq!(a, b);

        let c = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_ne!(a, c);
    }

    #[test]
    fn detection_mode_json_deserialization() {
        let strict: DetectionMode = serde_json::from_str(r#""strict""#).unwrap();
        assert_eq!(strict, DetectionMode::Strict);

        let mild: DetectionMode = serde_json::from_str(r#""mild""#).unwrap();
        assert_eq!(mild, DetectionMode::Mild);

        let weak: DetectionMode = serde_json::from_str(r#""weak""#).unwrap();
        assert_eq!(weak, DetectionMode::Weak);

        let semantic: DetectionMode = serde_json::from_str(r#""semantic""#).unwrap();
        assert_eq!(semantic, DetectionMode::Semantic);
    }

    #[test]
    fn detection_mode_json_serialization() {
        assert_eq!(
            serde_json::to_string(&DetectionMode::Strict).unwrap(),
            r#""strict""#
        );
        assert_eq!(
            serde_json::to_string(&DetectionMode::Semantic).unwrap(),
            r#""semantic""#
        );
    }

    #[test]
    fn detection_mode_invalid_json() {
        let result: Result<DetectionMode, _> = serde_json::from_str(r#""aggressive""#);
        assert!(result.is_err());
    }

    // ── Serialization ───────────────────────────────────────────────────────

    #[test]
    fn duplicates_config_json_roundtrip() {
        let config = DuplicatesConfig {
            enabled: false,
            mode: DetectionMode::Semantic,
            min_tokens: 100,
            min_lines: 10,
            min_occurrences: 4,
            threshold: 5.5,
            ignore: vec!["test/**".to_string()],
            ignore_defaults: false,
            skip_local: true,
            cross_language: true,
            ignore_imports: true,
            normalization: NormalizationConfig {
                ignore_identifiers: Some(true),
                ignore_string_values: None,
                ignore_numeric_values: Some(false),
            },
            min_corpus_size_for_shingle_filter: 2048,
            min_corpus_size_for_token_cache: 8_000,
        };
        let json = serde_json::to_string(&config).unwrap();
        let restored: DuplicatesConfig = serde_json::from_str(&json).unwrap();
        assert!(!restored.enabled);
        assert_eq!(restored.mode, DetectionMode::Semantic);
        assert_eq!(restored.min_tokens, 100);
        assert_eq!(restored.min_lines, 10);
        assert_eq!(restored.min_occurrences, 4);
        assert!(approx_eq(restored.threshold, 5.5));
        assert_eq!(restored.ignore, vec!["test/**"]);
        assert!(!restored.ignore_defaults);
        assert!(restored.skip_local);
        assert!(restored.cross_language);
        assert_eq!(restored.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(restored.min_corpus_size_for_token_cache, 8_000);
        assert!(restored.ignore_imports);
        assert_eq!(restored.normalization.ignore_identifiers, Some(true));
        assert!(restored.normalization.ignore_string_values.is_none());
        assert_eq!(restored.normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn normalization_none_fields_not_serialized() {
        let config = NormalizationConfig::default();
        let json = serde_json::to_string(&config).unwrap();
        assert!(
            !json.contains("ignoreIdentifiers"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreStringValues"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreNumericValues"),
            "None fields should be skipped"
        );
    }

    #[test]
    fn normalization_some_fields_serialized() {
        let config = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: Some(false),
        };
        let json = serde_json::to_string(&config).unwrap();
        assert!(json.contains("ignoreIdentifiers"));
        assert!(!json.contains("ignoreStringValues"));
        assert!(json.contains("ignoreNumericValues"));
    }

    // ── minOccurrences validation ───────────────────────────────────────────

    #[test]
    fn min_occurrences_accepts_two_or_more() {
        let json = r#"{"minOccurrences": 2}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.min_occurrences, 2);

        let json = r#"{"minOccurrences": 5}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.min_occurrences, 5);
    }

    #[test]
    fn min_occurrences_rejects_one() {
        let json = r#"{"minOccurrences": 1}"#;
        let err = serde_json::from_str::<DuplicatesConfig>(json).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }

    #[test]
    fn min_occurrences_rejects_zero() {
        let json = r#"{"minOccurrences": 0}"#;
        let err = serde_json::from_str::<DuplicatesConfig>(json).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }

    #[test]
    fn min_occurrences_rejects_one_in_toml() {
        let toml_str = "minOccurrences = 1";
        let err = toml::from_str::<DuplicatesConfig>(toml_str).unwrap_err();
        assert!(err.to_string().contains("at least 2"));
    }
}