Skip to main content

fallow_config/config/
duplicates_config.rs

1use schemars::JsonSchema;
2use serde::{Deserialize, Serialize};
3
/// Serde default provider that yields `true`.
///
/// Referenced by `#[serde(default = "default_true")]` on
/// `DuplicatesConfig::enabled`, so detection is on unless switched off.
const fn default_true() -> bool {
    const ENABLED_BY_DEFAULT: bool = true;
    ENABLED_BY_DEFAULT
}
7
/// Serde default provider for `DuplicatesConfig::min_tokens`.
///
/// Also used by `DuplicatesConfig::default()` so both paths agree.
const fn default_min_tokens() -> usize {
    const MIN_TOKENS: usize = 50;
    MIN_TOKENS
}
11
/// Serde default provider for `DuplicatesConfig::min_lines`.
///
/// Also used by `DuplicatesConfig::default()` so both paths agree.
const fn default_min_lines() -> usize {
    const MIN_LINES: usize = 5;
    MIN_LINES
}
15
/// Configuration for code duplication detection.
///
/// Keys are camelCase when (de)serialized (e.g. `minTokens`, `skipLocal`).
/// Every field is optional on input; a missing key falls back to the default
/// documented on that field.
// NOTE(review): the `JsonSchema` derive is expected to surface these doc
// comments as schema descriptions -- keep them user-facing; confirm against
// the schemars documentation before rewording heavily.
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct DuplicatesConfig {
    /// Whether duplication detection is enabled.
    ///
    /// Defaults to `true`.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Detection mode: strict, mild, weak, or semantic.
    ///
    /// Defaults to `mild`.
    #[serde(default)]
    pub mode: DetectionMode,

    /// Minimum number of tokens for a clone.
    ///
    /// Defaults to `50`.
    #[serde(default = "default_min_tokens")]
    pub min_tokens: usize,

    /// Minimum number of lines for a clone.
    ///
    /// Defaults to `5`.
    #[serde(default = "default_min_lines")]
    pub min_lines: usize,

    /// Maximum allowed duplication percentage (0 = no limit).
    ///
    /// Defaults to `0`, i.e. no limit.
    #[serde(default)]
    pub threshold: f64,

    /// Additional ignore patterns for duplication analysis.
    ///
    /// Defaults to an empty list.
    #[serde(default)]
    pub ignore: Vec<String>,

    /// Only report cross-directory duplicates.
    ///
    /// Defaults to `false`.
    #[serde(default)]
    pub skip_local: bool,

    /// Enable cross-language clone detection by stripping type annotations.
    ///
    /// When enabled, TypeScript type annotations (parameter types, return types,
    /// generics, interfaces, type aliases) are stripped from the token stream,
    /// allowing detection of clones between `.ts` and `.js` files.
    ///
    /// Defaults to `false`.
    #[serde(default)]
    pub cross_language: bool,

    /// Exclude ES `import` declarations from clone detection.
    ///
    /// When enabled, all `import` statements (value imports, type imports, and
    /// side-effect imports) are stripped from the token stream before clone
    /// detection. This reduces noise from sorted import blocks that naturally
    /// look similar across files. Only affects ES `import` declarations;
    /// CommonJS `require()` calls are not filtered.
    ///
    /// Defaults to `false`.
    #[serde(default)]
    pub ignore_imports: bool,

    /// Fine-grained normalization overrides on top of the detection mode.
    ///
    /// Defaults to no overrides, so the detection mode alone decides.
    #[serde(default)]
    pub normalization: NormalizationConfig,
}
70
71impl Default for DuplicatesConfig {
72    fn default() -> Self {
73        Self {
74            enabled: true,
75            mode: DetectionMode::default(),
76            min_tokens: default_min_tokens(),
77            min_lines: default_min_lines(),
78            threshold: 0.0,
79            ignore: vec![],
80            skip_local: false,
81            cross_language: false,
82            ignore_imports: false,
83            normalization: NormalizationConfig::default(),
84        }
85    }
86}
87
/// Fine-grained normalization overrides.
///
/// Each option, when set to `Some(true)`, forces that normalization regardless of
/// the detection mode. When set to `Some(false)`, it forces preservation. When
/// `None`, the detection mode's default behavior applies.
///
/// Keys are camelCase when (de)serialized (e.g. `ignoreIdentifiers`). A key
/// that is absent on input becomes `None`, and `None` fields are omitted on
/// output.
#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct NormalizationConfig {
    /// Blind all identifiers (variable names, function names, etc.) to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_identifiers: Option<bool>,

    /// Blind string literal values to the same hash.
    /// Default in `weak` and `semantic` modes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_string_values: Option<bool>,

    /// Blind numeric literal values to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_numeric_values: Option<bool>,
}
111
/// Resolved normalization flags: mode defaults merged with user overrides.
///
/// Unlike `NormalizationConfig`, every flag here is a plain `bool`: the
/// "unset" state has already been replaced by the detection mode's default.
/// Build one with `ResolvedNormalization::resolve`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ResolvedNormalization {
    /// Effective value of `NormalizationConfig::ignore_identifiers`.
    pub ignore_identifiers: bool,
    /// Effective value of `NormalizationConfig::ignore_string_values`.
    pub ignore_string_values: bool,
    /// Effective value of `NormalizationConfig::ignore_numeric_values`.
    pub ignore_numeric_values: bool,
}
119
120impl ResolvedNormalization {
121    /// Resolve normalization from a detection mode and optional overrides.
122    #[must_use]
123    pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
124        let (default_ids, default_strings, default_numbers) = match mode {
125            DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
126            DetectionMode::Weak => (false, true, false),
127            DetectionMode::Semantic => (true, true, true),
128        };
129
130        Self {
131            ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
132            ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
133            ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
134        }
135    }
136}
137
/// Detection mode controlling how aggressively tokens are normalized.
///
/// Since fallow uses AST-based tokenization (not lexer-based), whitespace and
/// comments are inherently absent from the token stream. The `Strict` and `Mild`
/// modes are currently equivalent. `Weak` mode additionally blinds string
/// literals. `Semantic` mode blinds all identifiers and literal values for
/// Type-2 (renamed variable) clone detection.
///
/// In config files each mode is written as its lowercase name (`strict`,
/// `mild`, `weak`, `semantic`); `Display` prints the same names.
/// The per-mode defaults can be overridden individually through
/// `NormalizationConfig`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "lowercase")]
pub enum DetectionMode {
    /// All tokens preserved including identifier names and literal values (Type-1 only).
    Strict,
    /// Default mode -- equivalent to strict for AST-based tokenization.
    #[default]
    Mild,
    /// Blind string literal values (structure-preserving).
    Weak,
    /// Blind all identifiers and literal values for structural (Type-2) detection.
    Semantic,
}
158
159impl std::fmt::Display for DetectionMode {
160    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
161        match self {
162            Self::Strict => write!(f, "strict"),
163            Self::Mild => write!(f, "mild"),
164            Self::Weak => write!(f, "weak"),
165            Self::Semantic => write!(f, "semantic"),
166        }
167    }
168}
169
170impl std::str::FromStr for DetectionMode {
171    type Err = String;
172
173    fn from_str(s: &str) -> Result<Self, Self::Err> {
174        match s.to_lowercase().as_str() {
175            "strict" => Ok(Self::Strict),
176            "mild" => Ok(Self::Mild),
177            "weak" => Ok(Self::Weak),
178            "semantic" => Ok(Self::Semantic),
179            other => Err(format!("unknown detection mode: '{other}'")),
180        }
181    }
182}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Every detection mode, for tests that must cover all variants.
    const ALL_MODES: [DetectionMode; 4] = [
        DetectionMode::Strict,
        DetectionMode::Mild,
        DetectionMode::Weak,
        DetectionMode::Semantic,
    ];

    /// Float comparison used for `threshold`, which round-trips through text.
    fn approx_eq(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    /// Asserts that no normalization override is set.
    #[track_caller]
    fn assert_no_overrides(normalization: &NormalizationConfig) {
        assert!(normalization.ignore_identifiers.is_none());
        assert!(normalization.ignore_string_values.is_none());
        assert!(normalization.ignore_numeric_values.is_none());
    }

    /// Asserts the fields that no partial-config test sets explicitly.
    #[track_caller]
    fn assert_untouched_fields_are_default(config: &DuplicatesConfig) {
        assert!(config.enabled);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert!(approx_eq(config.threshold, 0.0));
        assert!(config.ignore.is_empty());
        assert!(!config.skip_local);
        assert!(!config.cross_language);
        assert!(!config.ignore_imports);
        assert_no_overrides(&config.normalization);
    }

    // ── DuplicatesConfig defaults ────────────────────────────────────

    #[test]
    fn duplicates_config_defaults() {
        let config = DuplicatesConfig::default();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert!(approx_eq(config.threshold, 0.0));
        assert!(config.ignore.is_empty());
        assert!(!config.skip_local);
        assert!(!config.cross_language);
        assert!(!config.ignore_imports);
        assert_no_overrides(&config.normalization);
    }

    // ── DetectionMode FromStr ────────────────────────────────────────

    #[test]
    fn detection_mode_from_str_all_variants() {
        assert_eq!(
            "strict".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "mild".parse::<DetectionMode>().unwrap(),
            DetectionMode::Mild
        );
        assert_eq!(
            "weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "semantic".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        assert_eq!(
            "STRICT".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "Mild".parse::<DetectionMode>().unwrap(),
            DetectionMode::Mild
        );
        assert_eq!(
            "Weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "SEMANTIC".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let err = "foobar".parse::<DetectionMode>().unwrap_err();
        assert!(err.contains("unknown detection mode"));
        assert!(err.contains("foobar"));
    }

    #[test]
    fn detection_mode_from_str_rejects_empty_and_padded_input() {
        assert!("".parse::<DetectionMode>().is_err());
        assert!(" weak".parse::<DetectionMode>().is_err());
    }

    // ── DetectionMode Display ────────────────────────────────────────

    #[test]
    fn detection_mode_display() {
        assert_eq!(DetectionMode::Strict.to_string(), "strict");
        assert_eq!(DetectionMode::Mild.to_string(), "mild");
        assert_eq!(DetectionMode::Weak.to_string(), "weak");
        assert_eq!(DetectionMode::Semantic.to_string(), "semantic");
    }

    #[test]
    fn detection_mode_display_from_str_roundtrip() {
        for mode in ALL_MODES {
            assert_eq!(mode.to_string().parse::<DetectionMode>().unwrap(), mode);
        }
    }

    // ── ResolvedNormalization::resolve ────────────────────────────────

    #[test]
    fn resolve_strict_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Mild, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        let resolved = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_true() {
        // Strict mode defaults to all false, but override forces ignore_identifiers to true
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_false() {
        // Semantic mode defaults to all true, but overrides force two flags to false
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides);
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values); // not overridden
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: Some(false), // override weak default (true -> false)
            ignore_numeric_values: Some(true),
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Weak, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values); // overridden from true to false
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_full_overrides_ignore_mode() {
        // With every flag overridden, the mode must not influence the result.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(true),
            ignore_numeric_values: Some(false),
        };
        let expected = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: false,
        };
        for mode in ALL_MODES {
            assert_eq!(ResolvedNormalization::resolve(mode, &overrides), expected);
        }
    }

    // ── DuplicatesConfig deserialization ──────────────────────────────

    #[test]
    fn duplicates_config_json_all_fields() {
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "skipLocal": true,
            "crossLanguage": true,
            "ignoreImports": true
        }"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Semantic);
        assert_eq!(config.min_tokens, 100);
        assert_eq!(config.min_lines, 10);
        assert!(approx_eq(config.threshold, 5.0));
        assert_eq!(config.ignore, vec!["**/vendor/**"]);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(config.ignore_imports);
        // `normalization` was omitted, so no override may be set.
        assert_no_overrides(&config.normalization);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let json = r#"{"mode": "weak"}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_untouched_fields_are_default(&config);
    }

    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let config: NormalizationConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore_identifiers, Some(true));
        assert_eq!(config.ignore_string_values, Some(false));
        assert_eq!(config.ignore_numeric_values, None);
    }

    // ── TOML deserialization ────────────────────────────────────────

    #[test]
    fn duplicates_config_toml_all_fields() {
        let toml_str = r#"
enabled = false
mode = "weak"
minTokens = 75
minLines = 8
threshold = 3.0
ignore = ["vendor/**"]
skipLocal = true
crossLanguage = true
ignoreImports = true

[normalization]
ignoreIdentifiers = true
ignoreStringValues = true
ignoreNumericValues = false
"#;
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_eq!(config.min_tokens, 75);
        assert_eq!(config.min_lines, 8);
        assert!(approx_eq(config.threshold, 3.0));
        assert_eq!(config.ignore, vec!["vendor/**"]);
        assert!(config.skip_local);
        assert!(config.cross_language);
        assert!(config.ignore_imports);
        assert_eq!(config.normalization.ignore_identifiers, Some(true));
        assert_eq!(config.normalization.ignore_string_values, Some(true));
        assert_eq!(config.normalization.ignore_numeric_values, Some(false));
    }

    #[test]
    fn duplicates_config_toml_defaults() {
        let toml_str = "";
        let config: DuplicatesConfig = toml::from_str(toml_str).unwrap();
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_untouched_fields_are_default(&config);
    }

    // ── NormalizationConfig edge cases ──────────────────────────────

    #[test]
    fn normalization_config_default_all_none() {
        assert_no_overrides(&NormalizationConfig::default());
    }

    #[test]
    fn normalization_config_empty_json_object() {
        let config: NormalizationConfig = serde_json::from_str("{}").unwrap();
        assert_no_overrides(&config);
    }

    // ── DetectionMode default ───────────────────────────────────────

    #[test]
    fn detection_mode_default_is_mild() {
        assert_eq!(DetectionMode::default(), DetectionMode::Mild);
    }

    // ── ResolvedNormalization equality ───────────────────────────────

    #[test]
    fn resolved_normalization_equality() {
        let a = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let b = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_eq!(a, b);

        let c = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        assert_ne!(a, c);
    }

    // ── Detection mode JSON (de)serialization ────────────────────────

    #[test]
    fn detection_mode_json_deserialization() {
        let strict: DetectionMode = serde_json::from_str(r#""strict""#).unwrap();
        assert_eq!(strict, DetectionMode::Strict);

        let mild: DetectionMode = serde_json::from_str(r#""mild""#).unwrap();
        assert_eq!(mild, DetectionMode::Mild);

        let weak: DetectionMode = serde_json::from_str(r#""weak""#).unwrap();
        assert_eq!(weak, DetectionMode::Weak);

        let semantic: DetectionMode = serde_json::from_str(r#""semantic""#).unwrap();
        assert_eq!(semantic, DetectionMode::Semantic);
    }

    #[test]
    fn detection_mode_json_serialization_matches_display() {
        // The serde name and the `Display` name must stay in sync.
        for mode in ALL_MODES {
            let json = serde_json::to_string(&mode).unwrap();
            assert_eq!(json, format!("\"{mode}\""));
        }
    }

    #[test]
    fn detection_mode_invalid_json() {
        let result: Result<DetectionMode, _> = serde_json::from_str(r#""aggressive""#);
        assert!(result.is_err());
    }

    // ── Serialize roundtrip ─────────────────────────────────────────

    #[test]
    fn duplicates_config_json_roundtrip() {
        let config = DuplicatesConfig {
            enabled: false,
            mode: DetectionMode::Semantic,
            min_tokens: 100,
            min_lines: 10,
            threshold: 5.5,
            ignore: vec!["test/**".to_string()],
            skip_local: true,
            cross_language: true,
            ignore_imports: true,
            normalization: NormalizationConfig {
                ignore_identifiers: Some(true),
                ignore_string_values: None,
                ignore_numeric_values: Some(false),
            },
        };
        let json = serde_json::to_string(&config).unwrap();

        // Keys must be emitted in camelCase, matching what the parser accepts.
        for key in [
            "minTokens",
            "minLines",
            "skipLocal",
            "crossLanguage",
            "ignoreImports",
        ] {
            assert!(json.contains(key), "missing key {key} in {json}");
        }

        let restored: DuplicatesConfig = serde_json::from_str(&json).unwrap();
        assert!(!restored.enabled);
        assert_eq!(restored.mode, DetectionMode::Semantic);
        assert_eq!(restored.min_tokens, 100);
        assert_eq!(restored.min_lines, 10);
        assert!(approx_eq(restored.threshold, 5.5));
        assert_eq!(restored.ignore, vec!["test/**"]);
        assert!(restored.skip_local);
        assert!(restored.cross_language);
        assert!(restored.ignore_imports);
        assert_eq!(restored.normalization.ignore_identifiers, Some(true));
        assert!(restored.normalization.ignore_string_values.is_none());
        assert_eq!(restored.normalization.ignore_numeric_values, Some(false));
    }

    // ── NormalizationConfig skip_serializing_if ─────────────────────

    #[test]
    fn normalization_none_fields_not_serialized() {
        let config = NormalizationConfig::default();
        let json = serde_json::to_string(&config).unwrap();
        assert!(
            !json.contains("ignoreIdentifiers"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreStringValues"),
            "None fields should be skipped"
        );
        assert!(
            !json.contains("ignoreNumericValues"),
            "None fields should be skipped"
        );
    }

    #[test]
    fn normalization_some_fields_serialized() {
        let config = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: Some(false),
        };
        let json = serde_json::to_string(&config).unwrap();
        assert!(json.contains("ignoreIdentifiers"));
        assert!(!json.contains("ignoreStringValues"));
        assert!(json.contains("ignoreNumericValues"));
    }
}
573}