Skip to main content

fallow_config/config/
duplicates_config.rs

1use schemars::JsonSchema;
2use serde::{Deserialize, Serialize};
3
/// Serde default provider for boolean options that are switched on unless the
/// configuration explicitly says otherwise (`enabled`, `ignore_defaults`).
const fn default_true() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
7
/// Serde default provider for `DuplicatesConfig::min_tokens`: the token count
/// used when the configuration does not set `minTokens`.
const fn default_min_tokens() -> usize {
    const MIN_TOKENS: usize = 50;
    MIN_TOKENS
}
11
/// Serde default provider for `DuplicatesConfig::min_lines`: the line count
/// used when the configuration does not set `minLines`.
const fn default_min_lines() -> usize {
    const MIN_LINES: usize = 5;
    MIN_LINES
}
15
/// Serde default provider for
/// `DuplicatesConfig::min_corpus_size_for_shingle_filter`.
const fn default_min_corpus_size_for_shingle_filter() -> usize {
    const MIN_TOKENIZED_FILES: usize = 1_024;
    MIN_TOKENIZED_FILES
}
19
/// Serde default provider for
/// `DuplicatesConfig::min_corpus_size_for_token_cache`.
const fn default_min_corpus_size_for_token_cache() -> usize {
    const MIN_SOURCE_FILES: usize = 5_000;
    MIN_SOURCE_FILES
}
23
// Wire format: `rename_all = "camelCase"` means configuration files spell the
// keys `minTokens`, `skipLocal`, `ignoreDefaults`, and so on. Every field below
// carries a serde default, so any subset of keys (including none at all)
// deserializes successfully.
//
// NOTE(review): the `JsonSchema` derive presumably publishes the `///` comments
// below as schema descriptions -- treat their wording as user-facing and confirm
// against schemars before rewording them.
/// Configuration for code duplication detection.
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct DuplicatesConfig {
    // Omitted key => value supplied by `default_true()`.
    /// Whether duplication detection is enabled.
    #[serde(default = "default_true")]
    pub enabled: bool,

    // Omitted key => `DetectionMode::default()`.
    /// Detection mode: strict, mild, weak, or semantic.
    #[serde(default)]
    pub mode: DetectionMode,

    // Omitted key => value supplied by `default_min_tokens()`.
    /// Minimum number of tokens for a clone.
    #[serde(default = "default_min_tokens")]
    pub min_tokens: usize,

    // Omitted key => value supplied by `default_min_lines()`.
    /// Minimum number of lines for a clone.
    #[serde(default = "default_min_lines")]
    pub min_lines: usize,

    // Omitted key => `f64::default()` (0.0), which is the documented
    // "no limit" value.
    /// Maximum allowed duplication percentage (0 = no limit).
    #[serde(default)]
    pub threshold: f64,

    // Omitted key => empty list.
    /// Additional ignore patterns for duplication analysis.
    #[serde(default)]
    pub ignore: Vec<String>,

    // Omitted key => value supplied by `default_true()`, i.e. merging is the
    // default and must be switched off explicitly.
    /// Merge built-in generated-framework ignore patterns with `ignore`.
    ///
    /// Set to `false` to use only the user-provided `ignore` list.
    #[serde(default = "default_true")]
    pub ignore_defaults: bool,

    // Omitted key => `false`.
    /// Only report cross-directory duplicates.
    #[serde(default)]
    pub skip_local: bool,

    // Omitted key => `false`.
    /// Enable cross-language clone detection by stripping type annotations.
    ///
    /// When enabled, TypeScript type annotations (parameter types, return types,
    /// generics, interfaces, type aliases) are stripped from the token stream,
    /// allowing detection of clones between `.ts` and `.js` files.
    #[serde(default)]
    pub cross_language: bool,

    // Omitted key => `false`.
    /// Exclude ES `import` declarations from clone detection.
    ///
    /// When enabled, all `import` statements (value imports, type imports, and
    /// side-effect imports) are stripped from the token stream before clone
    /// detection. This reduces noise from sorted import blocks that naturally
    /// look similar across files. Only affects ES `import` declarations;
    /// CommonJS `require()` calls are not filtered.
    #[serde(default)]
    pub ignore_imports: bool,

    // Omitted key => `NormalizationConfig::default()` (every override unset, so
    // the detection mode alone decides).
    /// Fine-grained normalization overrides on top of the detection mode.
    #[serde(default)]
    pub normalization: NormalizationConfig,

    // Omitted key => value supplied by
    // `default_min_corpus_size_for_shingle_filter()`.
    /// Minimum tokenized file count before focused duplicate analysis prefilters
    /// unchanged files with k-token shingles.
    #[serde(default = "default_min_corpus_size_for_shingle_filter")]
    pub min_corpus_size_for_shingle_filter: usize,

    // Omitted key => value supplied by
    // `default_min_corpus_size_for_token_cache()`.
    /// Minimum source file count before the persistent duplication token cache
    /// activates. Below this threshold the cache load/save overhead exceeds the
    /// tokenize savings, so the cache stays disabled even when not running with
    /// `--no-cache`.
    #[serde(default = "default_min_corpus_size_for_token_cache")]
    pub min_corpus_size_for_token_cache: usize,
}
96
97impl Default for DuplicatesConfig {
98    fn default() -> Self {
99        Self {
100            enabled: true,
101            mode: DetectionMode::default(),
102            min_tokens: default_min_tokens(),
103            min_lines: default_min_lines(),
104            threshold: 0.0,
105            ignore: vec![],
106            ignore_defaults: true,
107            skip_local: false,
108            cross_language: false,
109            ignore_imports: false,
110            normalization: NormalizationConfig::default(),
111            min_corpus_size_for_shingle_filter: default_min_corpus_size_for_shingle_filter(),
112            min_corpus_size_for_token_cache: default_min_corpus_size_for_token_cache(),
113        }
114    }
115}
116
// Wire format: keys are camelCase (`ignoreIdentifiers`, `ignoreStringValues`,
// `ignoreNumericValues`). A missing key deserializes to `None`, and a `None`
// field is left out again on serialization (`skip_serializing_if`), so an
// all-unset value serializes without any of these keys.
//
// NOTE(review): the `JsonSchema` derive presumably publishes the `///` comments
// below as schema descriptions -- confirm before rewording them.
/// Fine-grained normalization overrides.
///
/// Each option, when set to `Some(true)`, forces that normalization regardless of
/// the detection mode. When set to `Some(false)`, it forces preservation. When
/// `None`, the detection mode's default behavior applies.
#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct NormalizationConfig {
    /// Blind all identifiers (variable names, function names, etc.) to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_identifiers: Option<bool>,

    /// Blind string literal values to the same hash.
    /// Default in `weak` and `semantic` modes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_string_values: Option<bool>,

    /// Blind numeric literal values to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_numeric_values: Option<bool>,
}
140
/// Final normalization switches after a detection mode's defaults have been
/// combined with any user-supplied overrides.
///
/// Unlike [`NormalizationConfig`], every flag here is a plain `bool`: nothing
/// is left undecided.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ResolvedNormalization {
    /// `true` when identifiers are blinded.
    pub ignore_identifiers: bool,
    /// `true` when string literal values are blinded.
    pub ignore_string_values: bool,
    /// `true` when numeric literal values are blinded.
    pub ignore_numeric_values: bool,
}
148
149impl ResolvedNormalization {
150    /// Resolve normalization from a detection mode and optional overrides.
151    #[must_use]
152    pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
153        let (default_ids, default_strings, default_numbers) = match mode {
154            DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
155            DetectionMode::Weak => (false, true, false),
156            DetectionMode::Semantic => (true, true, true),
157        };
158
159        Self {
160            ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
161            ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
162            ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
163        }
164    }
165}
166
// Wire format: `rename_all = "lowercase"` serializes the variants as `strict`,
// `mild`, `weak` and `semantic` -- the same spellings produced by the `Display`
// impl and accepted by the `FromStr` impl in this file. `#[default]` makes
// `Mild` the value of `DetectionMode::default()`.
//
// NOTE(review): the `JsonSchema` derive presumably publishes the `///` comments
// below as schema descriptions -- confirm before rewording them.
/// Detection mode controlling how aggressively tokens are normalized.
///
/// Since fallow uses AST-based tokenization (not lexer-based), whitespace and
/// comments are inherently absent from the token stream. The `Strict` and `Mild`
/// modes are currently equivalent. `Weak` mode additionally blinds string
/// literals. `Semantic` mode blinds all identifiers and literal values for
/// Type-2 (renamed variable) clone detection.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "lowercase")]
pub enum DetectionMode {
    /// All tokens preserved including identifier names and literal values (Type-1 only).
    Strict,
    /// Default mode -- equivalent to strict for AST-based tokenization.
    #[default]
    Mild,
    /// Blind string literal values (structure-preserving).
    Weak,
    /// Blind all identifiers and literal values for structural (Type-2) detection.
    Semantic,
}
187
188impl std::fmt::Display for DetectionMode {
189    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190        match self {
191            Self::Strict => write!(f, "strict"),
192            Self::Mild => write!(f, "mild"),
193            Self::Weak => write!(f, "weak"),
194            Self::Semantic => write!(f, "semantic"),
195        }
196    }
197}
198
199impl std::str::FromStr for DetectionMode {
200    type Err = String;
201
202    fn from_str(s: &str) -> Result<Self, Self::Err> {
203        match s.to_lowercase().as_str() {
204            "strict" => Ok(Self::Strict),
205            "mild" => Ok(Self::Mild),
206            "weak" => Ok(Self::Weak),
207            "semantic" => Ok(Self::Semantic),
208            other => Err(format!("unknown detection mode: '{other}'")),
209        }
210    }
211}
212
// Unit tests for the duplication-detection configuration types: defaults,
// string parsing/formatting of `DetectionMode`, normalization resolution, and
// JSON/TOML (de)serialization including the camelCase key names.
#[cfg(test)]
mod tests {
    use super::*;

    // ── DuplicatesConfig defaults ────────────────────────────────────

    #[test]
    fn duplicates_config_defaults() {
        let config = DuplicatesConfig::default();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert!((config.threshold - 0.0).abs() < f64::EPSILON);
        assert!(config.ignore.is_empty());
        assert!(config.ignore_defaults);
        assert!(!config.skip_local);
        assert!(!config.cross_language);
        assert!(!config.ignore_imports);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(config.min_corpus_size_for_token_cache, 5_000);
    }

    // ── DetectionMode FromStr ────────────────────────────────────────

    #[test]
    fn detection_mode_from_str_all_variants() {
        assert_eq!(
            "strict".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "mild".parse::<DetectionMode>().unwrap(),
            DetectionMode::Mild
        );
        assert_eq!(
            "weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "semantic".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        assert_eq!(
            "STRICT".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "Weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "SEMANTIC".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let err = "foobar".parse::<DetectionMode>().unwrap_err();
        assert!(err.contains("unknown detection mode"));
        assert!(err.contains("foobar"));
    }

    // ── DetectionMode Display ────────────────────────────────────────

    #[test]
    fn detection_mode_display() {
        assert_eq!(DetectionMode::Strict.to_string(), "strict");
        assert_eq!(DetectionMode::Mild.to_string(), "mild");
        assert_eq!(DetectionMode::Weak.to_string(), "weak");
        assert_eq!(DetectionMode::Semantic.to_string(), "semantic");
    }

    // The three textual forms of a mode (Display, FromStr, serde) are defined
    // independently; this pins them to the same spelling.
    #[test]
    fn detection_mode_text_forms_agree() {
        for mode in [
            DetectionMode::Strict,
            DetectionMode::Mild,
            DetectionMode::Weak,
            DetectionMode::Semantic,
        ] {
            let text = mode.to_string();
            assert_eq!(text.parse::<DetectionMode>().unwrap(), mode);
            assert_eq!(
                serde_json::to_string(&mode).unwrap(),
                format!("\"{text}\"")
            );
        }
    }

    // ── ResolvedNormalization::resolve ────────────────────────────────

    #[test]
    fn resolve_strict_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Mild, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        let resolved = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_true() {
        // Strict mode defaults to all false, but override forces ignore_identifiers to true
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_false() {
        // Semantic mode defaults to all true, but override forces ignore_identifiers to false
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides);
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values); // not overridden
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: Some(false), // override weak default (true -> false)
            ignore_numeric_values: Some(true),
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Weak, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values); // overridden from true to false
        assert!(resolved.ignore_numeric_values);
    }

    // ── DuplicatesConfig deserialization ──────────────────────────────

    #[test]
    fn duplicates_config_json_all_fields() {
        // Every key is set to a non-default value so a silently ignored or
        // misnamed key makes an assertion below fail.
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "ignoreDefaults": false,
            "skipLocal": true,
            "crossLanguage": true,
            "ignoreImports": true,
            "normalization": { "ignoreNumericValues": true },
            "minCorpusSizeForShingleFilter": 2048,
            "minCorpusSizeForTokenCache": 8000
        }"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Semantic);
        assert_eq!(config.min_tokens, 100);
        assert_eq!(config.min_lines, 10);
        assert!((config.threshold - 5.0).abs() < f64::EPSILON);
        assert_eq!(config.ignore, vec!["**/vendor/**"]);
        assert!(!config.ignore_defaults);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(config.ignore_imports);
        assert_eq!(config.normalization.ignore_identifiers, None);
        assert_eq!(config.normalization.ignore_string_values, None);
        assert_eq!(config.normalization.ignore_numeric_values, Some(true));
        assert_eq!(config.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(config.min_corpus_size_for_token_cache, 8_000);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let json = r#"{"mode": "weak"}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(config.enabled); // default
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_eq!(config.min_tokens, 50); // default
        assert_eq!(config.min_lines, 5); // default
        assert!(config.ignore_defaults);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 1024); // default
        assert_eq!(config.min_corpus_size_for_token_cache, 5_000); // default
        assert!(config.normalization.ignore_identifiers.is_none());
        assert!(config.normalization.ignore_string_values.is_none());
        assert!(config.normalization.ignore_numeric_values.is_none());
    }

    #[test]
    fn duplicates_config_json_ignore_defaults_merges_by_default() {
        let json = r#"{"ignore": ["**/foo/**"]}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore, vec!["**/foo/**"]);
        assert!(config.ignore_defaults);
    }

    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let config: NormalizationConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore_identifiers, Some(true));
        assert_eq!(config.ignore_string_values, Some(false));
        assert_eq!(config.ignore_numeric_values, None);
    }

    // ── TOML deserialization ────────────────────────────────────────

    #[test]
    fn duplicates_config_toml_all_fields() {
        // Top-level keys must precede the `[normalization]` table in TOML.
        let toml_str = r#"
enabled = false
mode = "weak"
minTokens = 75
minLines = 8
threshold = 3.0
ignore = ["vendor/**"]
ignoreDefaults = false
skipLocal = true
crossLanguage = true
ignoreImports = true
minCorpusSizeForShingleFilter = 2048
minCorpusSizeForTokenCache = 8000

[normalization]
ignoreIdentifiers = true
ignoreStringValues = true
ignoreNumericValues = false
"#;
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_eq!(config.min_tokens, 75);
        assert_eq!(config.min_lines, 8);
        assert!((config.threshold - 3.0).abs() < f64::EPSILON);
        assert_eq!(config.ignore, vec!["vendor/**"]);
        assert!(!config.ignore_defaults);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(config.ignore_imports);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(config.min_corpus_size_for_token_cache, 8_000);
        assert_eq!(config.normalization.ignore_identifiers, Some(true));
        assert_eq!(config.normalization.ignore_string_values, Some(true));
        assert_eq!(config.normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn duplicates_config_toml_defaults() {
        let toml_str = "";
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert!(config.ignore_defaults);
        assert_eq!(config.min_corpus_size_for_shingle_filter, 1024);
        assert_eq!(config.min_corpus_size_for_token_cache, 5_000);
    }

    // ── NormalizationConfig edge cases ──────────────────────────────

    #[test]
    fn normalization_config_default_all_none() {
        let config = NormalizationConfig::default();
        assert!(config.ignore_identifiers.is_none());
        assert!(config.ignore_string_values.is_none());
        assert!(config.ignore_numeric_values.is_none());
    }

    #[test]
    fn normalization_config_empty_json_object() {
        let config: NormalizationConfig = serde_json::from_str("{}").unwrap();
        assert!(config.ignore_identifiers.is_none());
        assert!(config.ignore_string_values.is_none());
        assert!(config.ignore_numeric_values.is_none());
    }

    // ── DetectionMode default ───────────────────────────────────────

    #[test]
    fn detection_mode_default_is_mild() {
        assert_eq!(DetectionMode::default(), DetectionMode::Mild);
    }

    // ── ResolvedNormalization equality ───────────────────────────────

    #[test]
    fn resolved_normalization_equality() {
        let a = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let b = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_eq!(a, b);

        let c = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_ne!(a, c);
    }

    // ── Detection mode JSON deserialization ──────────────────────────

    #[test]
    fn detection_mode_json_deserialization() {
        let strict: DetectionMode = serde_json::from_str(r#""strict""#).unwrap();
        assert_eq!(strict, DetectionMode::Strict);

        let mild: DetectionMode = serde_json::from_str(r#""mild""#).unwrap();
        assert_eq!(mild, DetectionMode::Mild);

        let weak: DetectionMode = serde_json::from_str(r#""weak""#).unwrap();
        assert_eq!(weak, DetectionMode::Weak);

        let semantic: DetectionMode = serde_json::from_str(r#""semantic""#).unwrap();
        assert_eq!(semantic, DetectionMode::Semantic);
    }

    #[test]
    fn detection_mode_invalid_json() {
        let result: Result<DetectionMode, _> = serde_json::from_str(r#""aggressive""#);
        assert!(result.is_err());
    }

    // ── Serialize roundtrip ─────────────────────────────────────────

    #[test]
    fn duplicates_config_json_roundtrip() {
        let config = DuplicatesConfig {
            enabled: false,
            mode: DetectionMode::Semantic,
            min_tokens: 100,
            min_lines: 10,
            threshold: 5.5,
            ignore: vec!["test/**".to_string()],
            ignore_defaults: false,
            skip_local: true,
            cross_language: true,
            ignore_imports: true,
            normalization: NormalizationConfig {
                ignore_identifiers: Some(true),
                ignore_string_values: None,
                ignore_numeric_values: Some(false),
            },
            min_corpus_size_for_shingle_filter: 2048,
            min_corpus_size_for_token_cache: 8_000,
        };
        let json = serde_json::to_string(&config).unwrap();
        let restored: DuplicatesConfig = serde_json::from_str(&json).unwrap();
        assert!(!restored.enabled);
        assert_eq!(restored.mode, DetectionMode::Semantic);
        assert_eq!(restored.min_tokens, 100);
        assert_eq!(restored.min_lines, 10);
        assert!((restored.threshold - 5.5).abs() < f64::EPSILON);
        assert_eq!(restored.ignore, vec!["test/**"]);
        assert!(!restored.ignore_defaults);
        assert!(restored.skip_local);
        assert!(restored.cross_language);
        assert_eq!(restored.min_corpus_size_for_shingle_filter, 2048);
        assert_eq!(restored.min_corpus_size_for_token_cache, 8_000);
        assert!(restored.ignore_imports);
        assert_eq!(restored.normalization.ignore_identifiers, Some(true));
        assert!(restored.normalization.ignore_string_values.is_none());
        assert_eq!(restored.normalization.ignore_numeric_values, Some(false));
    }

    // ── NormalizationConfig skip_serializing_if ─────────────────────

    #[test]
    fn normalization_none_fields_not_serialized() {
        let config = NormalizationConfig::default();
        let json = serde_json::to_string(&config).unwrap();
        assert!(
            !json.contains("ignoreIdentifiers"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreStringValues"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreNumericValues"),
            "None fields should be skipped"
        );
    }

    #[test]
    fn normalization_some_fields_serialized() {
        let config = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: Some(false),
        };
        let json = serde_json::to_string(&config).unwrap();
        assert!(json.contains("ignoreIdentifiers"));
        assert!(!json.contains("ignoreStringValues"));
        assert!(json.contains("ignoreNumericValues"));
    }
}