Skip to main content

fallow_config/config/
duplicates_config.rs

1use schemars::JsonSchema;
2use serde::{Deserialize, Serialize};
3
/// Serde fallback for settings that are on unless the user turns them off.
///
/// Referenced by path from the `#[serde(default = "default_true")]` attribute on
/// `DuplicatesConfig::enabled`.
const fn default_true() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
7
/// Default minimum clone size, measured in tokens.
///
/// Shared by the `#[serde(default = "default_min_tokens")]` attribute on
/// `DuplicatesConfig::min_tokens` and by `DuplicatesConfig::default()`.
const fn default_min_tokens() -> usize {
    const MIN_TOKENS: usize = 50;
    MIN_TOKENS
}
11
/// Default minimum clone size, measured in source lines.
///
/// Shared by the `#[serde(default = "default_min_lines")]` attribute on
/// `DuplicatesConfig::min_lines` and by `DuplicatesConfig::default()`.
const fn default_min_lines() -> usize {
    const MIN_LINES: usize = 5;
    MIN_LINES
}
15
// Maintainer notes (plain `//` comments so the `///` text below is left untouched):
// - `rename_all = "camelCase"` means the multi-word keys are spelled `minTokens`,
//   `minLines`, `skipLocal` and `crossLanguage` in config files.
// - Every field carries a serde default, so any subset of keys (including an empty
//   document) deserializes. The per-field defaults currently produce the same
//   values as the `Default` impl; keep the two in sync when changing either.
// - NOTE(review): the `JsonSchema` derive typically surfaces `///` text as schema
//   descriptions, so treat the doc comments on this type as user-facing wording.
/// Configuration for code duplication detection.
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct DuplicatesConfig {
    // Absent key => `true` (supplied by `default_true`).
    /// Whether duplication detection is enabled.
    #[serde(default = "default_true")]
    pub enabled: bool,

    // Absent key => `DetectionMode::default()`, i.e. `mild`.
    /// Detection mode: strict, mild, weak, or semantic.
    #[serde(default)]
    pub mode: DetectionMode,

    // Absent key => 50 (supplied by `default_min_tokens`).
    /// Minimum number of tokens for a clone.
    #[serde(default = "default_min_tokens")]
    pub min_tokens: usize,

    // Absent key => 5 (supplied by `default_min_lines`).
    /// Minimum number of lines for a clone.
    #[serde(default = "default_min_lines")]
    pub min_lines: usize,

    // Absent key => `f64::default()` = 0.0, which is the "no limit" value.
    // No range validation happens at this layer.
    /// Maximum allowed duplication percentage (0 = no limit).
    #[serde(default)]
    pub threshold: f64,

    // Absent key => empty list.
    /// Additional ignore patterns for duplication analysis.
    #[serde(default)]
    pub ignore: Vec<String>,

    // Absent key => `false`.
    /// Only report cross-directory duplicates.
    #[serde(default)]
    pub skip_local: bool,

    // Absent key => `false`.
    /// Enable cross-language clone detection by stripping type annotations.
    ///
    /// When enabled, TypeScript type annotations (parameter types, return types,
    /// generics, interfaces, type aliases) are stripped from the token stream,
    /// allowing detection of clones between `.ts` and `.js` files.
    #[serde(default)]
    pub cross_language: bool,

    // Absent table => every override unset, so `mode` alone decides normalization.
    /// Fine-grained normalization overrides on top of the detection mode.
    #[serde(default)]
    pub normalization: NormalizationConfig,
}
60
61impl Default for DuplicatesConfig {
62    fn default() -> Self {
63        Self {
64            enabled: true,
65            mode: DetectionMode::default(),
66            min_tokens: default_min_tokens(),
67            min_lines: default_min_lines(),
68            threshold: 0.0,
69            ignore: vec![],
70            skip_local: false,
71            cross_language: false,
72            normalization: NormalizationConfig::default(),
73        }
74    }
75}
76
// Maintainer notes (plain `//` comments so the `///` text below is left untouched):
// - `rename_all = "camelCase"` means the keys are spelled `ignoreIdentifiers`,
//   `ignoreStringValues` and `ignoreNumericValues` in config files.
// - `default` lets any key be omitted (it becomes `None`); `skip_serializing_if`
//   drops `None` fields on output, so an all-`None` value serializes with no keys.
// - The `Option<bool>` values are turned into concrete flags by
//   `ResolvedNormalization::resolve`.
// - NOTE(review): the `JsonSchema` derive typically surfaces `///` text as schema
//   descriptions, so treat the doc comments on this type as user-facing wording.
/// Fine-grained normalization overrides.
///
/// Each option, when set to `Some(true)`, forces that normalization regardless of
/// the detection mode. When set to `Some(false)`, it forces preservation. When
/// `None`, the detection mode's default behavior applies.
#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct NormalizationConfig {
    /// Blind all identifiers (variable names, function names, etc.) to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_identifiers: Option<bool>,

    /// Blind string literal values to the same hash.
    /// Default in `weak` and `semantic` modes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_string_values: Option<bool>,

    /// Blind numeric literal values to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_numeric_values: Option<bool>,
}
100
/// Concrete normalization switches produced by combining a [`DetectionMode`]'s
/// defaults with any explicit [`NormalizationConfig`] overrides.
///
/// Unlike the config type, every flag is a plain `bool`: once resolution has
/// happened there is no "unset" state left.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ResolvedNormalization {
    /// `true` when identifiers (variable names, function names, etc.) are blinded.
    pub ignore_identifiers: bool,
    /// `true` when string literal values are blinded.
    pub ignore_string_values: bool,
    /// `true` when numeric literal values are blinded.
    pub ignore_numeric_values: bool,
}
108
109impl ResolvedNormalization {
110    /// Resolve normalization from a detection mode and optional overrides.
111    #[must_use]
112    pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
113        let (default_ids, default_strings, default_numbers) = match mode {
114            DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
115            DetectionMode::Weak => (false, true, false),
116            DetectionMode::Semantic => (true, true, true),
117        };
118
119        Self {
120            ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
121            ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
122            ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
123        }
124    }
125}
126
// Maintainer notes (plain `//` comments so the `///` text below is left untouched):
// - `rename_all = "lowercase"` gives the serde names `strict`, `mild`, `weak` and
//   `semantic` — the same strings the `Display` impl writes and `FromStr` accepts.
// - `#[default]` makes `Mild` the value used when a config omits `mode`.
// - The per-mode normalization defaults live in `ResolvedNormalization::resolve`.
// - NOTE(review): serde presumably matches these names exactly, whereas `FromStr`
//   accepts any letter case — confirm before relying on mixed-case values in
//   config files.
// - NOTE(review): the `JsonSchema` derive typically surfaces `///` text as schema
//   descriptions, so treat the doc comments on this type as user-facing wording.
/// Detection mode controlling how aggressively tokens are normalized.
///
/// Since fallow uses AST-based tokenization (not lexer-based), whitespace and
/// comments are inherently absent from the token stream. The `Strict` and `Mild`
/// modes are currently equivalent. `Weak` mode additionally blinds string
/// literals. `Semantic` mode blinds all identifiers and literal values for
/// Type-2 (renamed variable) clone detection.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "lowercase")]
pub enum DetectionMode {
    /// All tokens preserved including identifier names and literal values (Type-1 only).
    Strict,
    /// Default mode -- equivalent to strict for AST-based tokenization.
    #[default]
    Mild,
    /// Blind string literal values (structure-preserving).
    Weak,
    /// Blind all identifiers and literal values for structural (Type-2) detection.
    Semantic,
}
147
148impl std::fmt::Display for DetectionMode {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        match self {
151            Self::Strict => write!(f, "strict"),
152            Self::Mild => write!(f, "mild"),
153            Self::Weak => write!(f, "weak"),
154            Self::Semantic => write!(f, "semantic"),
155        }
156    }
157}
158
159impl std::str::FromStr for DetectionMode {
160    type Err = String;
161
162    fn from_str(s: &str) -> Result<Self, Self::Err> {
163        match s.to_lowercase().as_str() {
164            "strict" => Ok(Self::Strict),
165            "mild" => Ok(Self::Mild),
166            "weak" => Ok(Self::Weak),
167            "semantic" => Ok(Self::Semantic),
168            other => Err(format!("unknown detection mode: '{other}'")),
169        }
170    }
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Float comparison used for the `threshold` field.
    fn approx_eq(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    /// Builds the expected outcome of a `ResolvedNormalization::resolve` call.
    fn flags(identifiers: bool, strings: bool, numbers: bool) -> ResolvedNormalization {
        ResolvedNormalization {
            ignore_identifiers: identifiers,
            ignore_string_values: strings,
            ignore_numeric_values: numbers,
        }
    }

    /// Resolves `mode` with no user overrides at all.
    fn resolve_plain(mode: DetectionMode) -> ResolvedNormalization {
        ResolvedNormalization::resolve(mode, &NormalizationConfig::default())
    }

    // ---- DuplicatesConfig defaults ----

    #[test]
    fn duplicates_config_defaults() {
        let defaults = DuplicatesConfig::default();
        assert!(defaults.enabled);
        assert_eq!(defaults.mode, DetectionMode::Mild);
        assert_eq!((defaults.min_tokens, defaults.min_lines), (50, 5));
        assert!(approx_eq(defaults.threshold, 0.0));
        assert!(defaults.ignore.is_empty());
        assert_eq!((defaults.skip_local, defaults.cross_language), (false, false));
    }

    // ---- DetectionMode FromStr ----

    #[test]
    fn detection_mode_from_str_all_variants() {
        let cases = [
            ("strict", DetectionMode::Strict),
            ("mild", DetectionMode::Mild),
            ("weak", DetectionMode::Weak),
            ("semantic", DetectionMode::Semantic),
        ];
        for (input, expected) in cases {
            assert_eq!(input.parse::<DetectionMode>().unwrap(), expected);
        }
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        let cases = [
            ("STRICT", DetectionMode::Strict),
            ("Weak", DetectionMode::Weak),
            ("SEMANTIC", DetectionMode::Semantic),
        ];
        for (input, expected) in cases {
            assert_eq!(input.parse::<DetectionMode>().unwrap(), expected);
        }
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let message = "foobar".parse::<DetectionMode>().unwrap_err();
        for fragment in ["unknown detection mode", "foobar"] {
            assert!(message.contains(fragment));
        }
    }

    // ---- DetectionMode Display ----

    #[test]
    fn detection_mode_display() {
        let cases = [
            (DetectionMode::Strict, "strict"),
            (DetectionMode::Mild, "mild"),
            (DetectionMode::Weak, "weak"),
            (DetectionMode::Semantic, "semantic"),
        ];
        for (mode, rendered) in cases {
            assert_eq!(mode.to_string(), rendered);
        }
    }

    // ---- ResolvedNormalization::resolve ----

    #[test]
    fn resolve_strict_mode_all_false() {
        assert_eq!(
            resolve_plain(DetectionMode::Strict),
            flags(false, false, false)
        );
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        assert_eq!(
            resolve_plain(DetectionMode::Mild),
            flags(false, false, false)
        );
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        assert_eq!(resolve_plain(DetectionMode::Weak), flags(false, true, false));
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        assert_eq!(
            resolve_plain(DetectionMode::Semantic),
            flags(true, true, true)
        );
    }

    #[test]
    fn resolve_override_forces_true() {
        // Strict keeps everything by default; only the identifier flag is forced on.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ..NormalizationConfig::default()
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Strict, &overrides),
            flags(true, false, false)
        );
    }

    #[test]
    fn resolve_override_forces_false() {
        // Semantic blinds everything by default; two flags are forced off and the
        // numeric flag is left to the mode.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides),
            flags(false, false, true)
        );
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        // The string flag flips weak's default (true -> false); the others are forced on.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: Some(false),
            ignore_numeric_values: Some(true),
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Weak, &overrides),
            flags(true, false, true)
        );
    }

    // ---- DuplicatesConfig deserialization ----

    #[test]
    fn duplicates_config_json_all_fields() {
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "skipLocal": true,
            "crossLanguage": true
        }"#;
        let parsed = serde_json::from_str::<DuplicatesConfig>(json).unwrap();
        assert!(!parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Semantic);
        assert_eq!((parsed.min_tokens, parsed.min_lines), (100, 10));
        assert!(approx_eq(parsed.threshold, 5.0));
        assert_eq!(parsed.ignore, ["**/vendor/**"]);
        assert!(parsed.skip_local);
        assert!(parsed.cross_language);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let json = r#"{"mode": "weak"}"#;
        let parsed = serde_json::from_str::<DuplicatesConfig>(json).unwrap();
        assert_eq!(parsed.mode, DetectionMode::Weak);
        // Everything else falls back to its default.
        assert!(parsed.enabled);
        assert_eq!((parsed.min_tokens, parsed.min_lines), (50, 5));
    }

    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let parsed = serde_json::from_str::<NormalizationConfig>(json).unwrap();
        assert_eq!(
            (
                parsed.ignore_identifiers,
                parsed.ignore_string_values,
                parsed.ignore_numeric_values,
            ),
            (Some(true), Some(false), None)
        );
    }

    // ---- TOML deserialization ----

    #[test]
    fn duplicates_config_toml_all_fields() {
        let toml_str = r#"
enabled = false
mode = "weak"
minTokens = 75
minLines = 8
threshold = 3.0
ignore = ["vendor/**"]
skipLocal = true
crossLanguage = true

[normalization]
ignoreIdentifiers = true
ignoreStringValues = true
ignoreNumericValues = false
"#;
        let parsed = toml::from_str::<DuplicatesConfig>(toml_str).unwrap();
        assert!(!parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Weak);
        assert_eq!((parsed.min_tokens, parsed.min_lines), (75, 8));
        assert!(approx_eq(parsed.threshold, 3.0));
        assert_eq!(parsed.ignore, ["vendor/**"]);
        assert!(parsed.skip_local);
        assert!(parsed.cross_language);

        let overrides = &parsed.normalization;
        assert_eq!(overrides.ignore_identifiers, Some(true));
        assert_eq!(overrides.ignore_string_values, Some(true));
        assert_eq!(overrides.ignore_numeric_values, Some(false));
    }

    #[test]
    fn duplicates_config_toml_defaults() {
        let toml_str = "";
        let parsed = toml::from_str::<DuplicatesConfig>(toml_str).unwrap();
        assert!(parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Mild);
        assert_eq!((parsed.min_tokens, parsed.min_lines), (50, 5));
    }

    // ---- NormalizationConfig edge cases ----

    #[test]
    fn normalization_config_default_all_none() {
        let NormalizationConfig {
            ignore_identifiers,
            ignore_string_values,
            ignore_numeric_values,
        } = NormalizationConfig::default();
        assert!(ignore_identifiers.is_none());
        assert!(ignore_string_values.is_none());
        assert!(ignore_numeric_values.is_none());
    }

    #[test]
    fn normalization_config_empty_json_object() {
        let NormalizationConfig {
            ignore_identifiers,
            ignore_string_values,
            ignore_numeric_values,
        } = serde_json::from_str("{}").unwrap();
        assert!(ignore_identifiers.is_none());
        assert!(ignore_string_values.is_none());
        assert!(ignore_numeric_values.is_none());
    }

    // ---- DetectionMode default ----

    #[test]
    fn detection_mode_default_is_mild() {
        assert_eq!(DetectionMode::default(), DetectionMode::Mild);
    }

    // ---- ResolvedNormalization equality ----

    #[test]
    fn resolved_normalization_equality() {
        let a = flags(true, false, true);
        let b = flags(true, false, true);
        assert_eq!(a, b);

        // Differs from `a` only in the identifier flag.
        let c = flags(false, false, true);
        assert_ne!(a, c);
    }

    // ---- Detection mode JSON deserialization ----

    #[test]
    fn detection_mode_json_deserialization() {
        let cases = [
            (r#""strict""#, DetectionMode::Strict),
            (r#""mild""#, DetectionMode::Mild),
            (r#""weak""#, DetectionMode::Weak),
            (r#""semantic""#, DetectionMode::Semantic),
        ];
        for (json, expected) in cases {
            let parsed: DetectionMode = serde_json::from_str(json).unwrap();
            assert_eq!(parsed, expected);
        }
    }

    #[test]
    fn detection_mode_invalid_json() {
        let outcome = serde_json::from_str::<DetectionMode>(r#""aggressive""#);
        assert!(outcome.is_err());
    }

    // ---- Serialize roundtrip ----

    #[test]
    fn duplicates_config_json_roundtrip() {
        let original = DuplicatesConfig {
            enabled: false,
            mode: DetectionMode::Semantic,
            min_tokens: 100,
            min_lines: 10,
            threshold: 5.5,
            ignore: vec![String::from("test/**")],
            skip_local: true,
            cross_language: true,
            normalization: NormalizationConfig {
                ignore_identifiers: Some(true),
                ignore_string_values: None,
                ignore_numeric_values: Some(false),
            },
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let restored = serde_json::from_str::<DuplicatesConfig>(&encoded).unwrap();

        assert!(!restored.enabled);
        assert_eq!(restored.mode, DetectionMode::Semantic);
        assert_eq!((restored.min_tokens, restored.min_lines), (100, 10));
        assert!(approx_eq(restored.threshold, 5.5));
        assert!(restored.skip_local);
        assert!(restored.cross_language);

        let overrides = &restored.normalization;
        assert_eq!(overrides.ignore_identifiers, Some(true));
        assert!(overrides.ignore_string_values.is_none());
        assert_eq!(overrides.ignore_numeric_values, Some(false));
    }

    // ---- NormalizationConfig skip_serializing_if ----

    #[test]
    fn normalization_none_fields_not_serialized() {
        let json = serde_json::to_string(&NormalizationConfig::default()).unwrap();
        for key in [
            "ignoreIdentifiers",
            "ignoreStringValues",
            "ignoreNumericValues",
        ] {
            assert!(!json.contains(key), "None fields should be skipped");
        }
    }

    #[test]
    fn normalization_some_fields_serialized() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: Some(false),
        };
        let json = serde_json::to_string(&overrides).unwrap();
        assert!(json.contains("ignoreIdentifiers"));
        assert!(!json.contains("ignoreStringValues"));
        assert!(json.contains("ignoreNumericValues"));
    }
}