// fallow_config/config/duplicates_config.rs

use std::fmt;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
3
/// Serde default helper returning `true`.
///
/// Referenced by `#[serde(default = "default_true")]` so that a missing
/// boolean field deserializes as enabled rather than `bool::default()`.
const fn default_true() -> bool {
    true
}
7
/// Serde default helper for the minimum clone size in tokens (`50`).
///
/// Referenced by `#[serde(default = "default_min_tokens")]` and by the
/// `Default` implementation of `DuplicatesConfig`.
const fn default_min_tokens() -> usize {
    50
}
11
/// Serde default helper for the minimum clone size in lines (`5`).
///
/// Referenced by `#[serde(default = "default_min_lines")]` and by the
/// `Default` implementation of `DuplicatesConfig`.
const fn default_min_lines() -> usize {
    5
}
15
16/// Configuration for code duplication detection.
17#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
18#[serde(rename_all = "camelCase")]
19pub struct DuplicatesConfig {
20    /// Whether duplication detection is enabled.
21    #[serde(default = "default_true")]
22    pub enabled: bool,
23
24    /// Detection mode: strict, mild, weak, or semantic.
25    #[serde(default)]
26    pub mode: DetectionMode,
27
28    /// Minimum number of tokens for a clone.
29    #[serde(default = "default_min_tokens")]
30    pub min_tokens: usize,
31
32    /// Minimum number of lines for a clone.
33    #[serde(default = "default_min_lines")]
34    pub min_lines: usize,
35
36    /// Maximum allowed duplication percentage (0 = no limit).
37    #[serde(default)]
38    pub threshold: f64,
39
40    /// Additional ignore patterns for duplication analysis.
41    #[serde(default)]
42    pub ignore: Vec<String>,
43
44    /// Only report cross-directory duplicates.
45    #[serde(default)]
46    pub skip_local: bool,
47
48    /// Enable cross-language clone detection by stripping type annotations.
49    ///
50    /// When enabled, TypeScript type annotations (parameter types, return types,
51    /// generics, interfaces, type aliases) are stripped from the token stream,
52    /// allowing detection of clones between `.ts` and `.js` files.
53    #[serde(default)]
54    pub cross_language: bool,
55
56    /// Fine-grained normalization overrides on top of the detection mode.
57    #[serde(default)]
58    pub normalization: NormalizationConfig,
59}
60
61impl Default for DuplicatesConfig {
62    fn default() -> Self {
63        Self {
64            enabled: true,
65            mode: DetectionMode::default(),
66            min_tokens: default_min_tokens(),
67            min_lines: default_min_lines(),
68            threshold: 0.0,
69            ignore: vec![],
70            skip_local: false,
71            cross_language: false,
72            normalization: NormalizationConfig::default(),
73        }
74    }
75}
76
77/// Fine-grained normalization overrides.
78///
79/// Each option, when set to `Some(true)`, forces that normalization regardless of
80/// the detection mode. When set to `Some(false)`, it forces preservation. When
81/// `None`, the detection mode's default behavior applies.
82#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
83#[serde(rename_all = "camelCase")]
84pub struct NormalizationConfig {
85    /// Blind all identifiers (variable names, function names, etc.) to the same hash.
86    /// Default in `semantic` mode.
87    #[serde(default, skip_serializing_if = "Option::is_none")]
88    pub ignore_identifiers: Option<bool>,
89
90    /// Blind string literal values to the same hash.
91    /// Default in `weak` and `semantic` modes.
92    #[serde(default, skip_serializing_if = "Option::is_none")]
93    pub ignore_string_values: Option<bool>,
94
95    /// Blind numeric literal values to the same hash.
96    /// Default in `semantic` mode.
97    #[serde(default, skip_serializing_if = "Option::is_none")]
98    pub ignore_numeric_values: Option<bool>,
99}
100
/// Resolved normalization flags: mode defaults merged with user overrides.
///
/// Unlike `NormalizationConfig`, every flag here is a plain `bool`: there is
/// no "unset" state left once resolution has happened.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ResolvedNormalization {
    /// Final decision for `NormalizationConfig::ignore_identifiers`.
    pub ignore_identifiers: bool,
    /// Final decision for `NormalizationConfig::ignore_string_values`.
    pub ignore_string_values: bool,
    /// Final decision for `NormalizationConfig::ignore_numeric_values`.
    pub ignore_numeric_values: bool,
}
108
109impl ResolvedNormalization {
110    /// Resolve normalization from a detection mode and optional overrides.
111    pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
112        let (default_ids, default_strings, default_numbers) = match mode {
113            DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
114            DetectionMode::Weak => (false, true, false),
115            DetectionMode::Semantic => (true, true, true),
116        };
117
118        Self {
119            ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
120            ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
121            ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
122        }
123    }
124}
125
126/// Detection mode controlling how aggressively tokens are normalized.
127///
128/// Since fallow uses AST-based tokenization (not lexer-based), whitespace and
129/// comments are inherently absent from the token stream. The `Strict` and `Mild`
130/// modes are currently equivalent. `Weak` mode additionally blinds string
131/// literals. `Semantic` mode blinds all identifiers and literal values for
132/// Type-2 (renamed variable) clone detection.
133#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
134#[serde(rename_all = "lowercase")]
135pub enum DetectionMode {
136    /// All tokens preserved including identifier names and literal values (Type-1 only).
137    Strict,
138    /// Default mode -- equivalent to strict for AST-based tokenization.
139    #[default]
140    Mild,
141    /// Blind string literal values (structure-preserving).
142    Weak,
143    /// Blind all identifiers and literal values for structural (Type-2) detection.
144    Semantic,
145}
146
147impl std::fmt::Display for DetectionMode {
148    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
149        match self {
150            Self::Strict => write!(f, "strict"),
151            Self::Mild => write!(f, "mild"),
152            Self::Weak => write!(f, "weak"),
153            Self::Semantic => write!(f, "semantic"),
154        }
155    }
156}
157
158impl std::str::FromStr for DetectionMode {
159    type Err = String;
160
161    fn from_str(s: &str) -> Result<Self, Self::Err> {
162        match s.to_lowercase().as_str() {
163            "strict" => Ok(Self::Strict),
164            "mild" => Ok(Self::Mild),
165            "weak" => Ok(Self::Weak),
166            "semantic" => Ok(Self::Semantic),
167            other => Err(format!("unknown detection mode: '{other}'")),
168        }
169    }
170}
171
/// Unit tests covering defaults, string conversions, normalization
/// resolution, and JSON deserialization of the duplicates configuration.
#[cfg(test)]
mod tests {
    use super::*;

    // ── DuplicatesConfig defaults ────────────────────────────────────

    #[test]
    fn duplicates_config_defaults() {
        let config = DuplicatesConfig::default();
        assert!(config.enabled);
        assert_eq!(config.mode, DetectionMode::Mild);
        assert_eq!(config.min_tokens, 50);
        assert_eq!(config.min_lines, 5);
        assert_eq!(config.threshold, 0.0);
        assert!(config.ignore.is_empty());
        assert!(!config.skip_local);
        assert!(!config.cross_language);
        // No normalization override is set by default; the mode decides.
        assert!(config.normalization.ignore_identifiers.is_none());
        assert!(config.normalization.ignore_string_values.is_none());
        assert!(config.normalization.ignore_numeric_values.is_none());
    }

    // ── DetectionMode FromStr ────────────────────────────────────────

    #[test]
    fn detection_mode_from_str_all_variants() {
        assert_eq!(
            "strict".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "mild".parse::<DetectionMode>().unwrap(),
            DetectionMode::Mild
        );
        assert_eq!(
            "weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "semantic".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        assert_eq!(
            "STRICT".parse::<DetectionMode>().unwrap(),
            DetectionMode::Strict
        );
        assert_eq!(
            "Weak".parse::<DetectionMode>().unwrap(),
            DetectionMode::Weak
        );
        assert_eq!(
            "SEMANTIC".parse::<DetectionMode>().unwrap(),
            DetectionMode::Semantic
        );
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let err = "foobar".parse::<DetectionMode>().unwrap_err();
        assert!(err.contains("unknown detection mode"));
        assert!(err.contains("foobar"));
    }

    // ── DetectionMode Display ────────────────────────────────────────

    #[test]
    fn detection_mode_display() {
        assert_eq!(DetectionMode::Strict.to_string(), "strict");
        assert_eq!(DetectionMode::Mild.to_string(), "mild");
        assert_eq!(DetectionMode::Weak.to_string(), "weak");
        assert_eq!(DetectionMode::Semantic.to_string(), "semantic");
    }

    // ── ResolvedNormalization::resolve ────────────────────────────────

    #[test]
    fn resolve_strict_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Mild, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        let resolved =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        let resolved = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(resolved.ignore_identifiers);
        assert!(resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_true() {
        // Strict mode defaults to all false, but override forces ignore_identifiers to true
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(!resolved.ignore_numeric_values);
    }

    #[test]
    fn resolve_override_forces_false() {
        // Semantic mode defaults to all true, but overrides force identifiers and
        // string values to false
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides);
        assert!(!resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values);
        assert!(resolved.ignore_numeric_values); // not overridden
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: Some(false), // override weak default (true -> false)
            ignore_numeric_values: Some(true),
        };
        let resolved = ResolvedNormalization::resolve(DetectionMode::Weak, &overrides);
        assert!(resolved.ignore_identifiers);
        assert!(!resolved.ignore_string_values); // overridden from true to false
        assert!(resolved.ignore_numeric_values);
    }

    // ── DuplicatesConfig deserialization ──────────────────────────────

    #[test]
    fn duplicates_config_json_all_fields() {
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "skipLocal": true,
            "crossLanguage": true
        }"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.mode, DetectionMode::Semantic);
        assert_eq!(config.min_tokens, 100);
        assert_eq!(config.min_lines, 10);
        assert_eq!(config.threshold, 5.0);
        assert_eq!(config.ignore, vec!["**/vendor/**"]);
        assert!(config.skip_local);
        assert!(config.cross_language);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let json = r#"{"mode": "weak"}"#;
        let config: DuplicatesConfig = serde_json::from_str(json).unwrap();
        assert!(config.enabled); // default
        assert_eq!(config.mode, DetectionMode::Weak);
        assert_eq!(config.min_tokens, 50); // default
        assert_eq!(config.min_lines, 5); // default
    }

    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let config: NormalizationConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.ignore_identifiers, Some(true));
        assert_eq!(config.ignore_string_values, Some(false));
        assert_eq!(config.ignore_numeric_values, None);
    }
}
373}