Skip to main content

fallow_config/config/
duplicates_config.rs

1use schemars::JsonSchema;
2use serde::{Deserialize, Serialize};
3
/// Serde default for boolean options that are switched on unless the user
/// turns them off (used by `DuplicatesConfig::enabled`).
const fn default_true() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
7
/// Serde default for `DuplicatesConfig::min_tokens`: the smallest clone size,
/// in tokens, when the config does not specify one.
const fn default_min_tokens() -> usize {
    const MIN_TOKENS: usize = 50;
    MIN_TOKENS
}
11
/// Serde default for `DuplicatesConfig::min_lines`: the smallest clone size,
/// in lines, when the config does not specify one.
const fn default_min_lines() -> usize {
    const MIN_LINES: usize = 5;
    MIN_LINES
}
15
/// Configuration for code duplication detection.
// NOTE(review): the `JsonSchema` derive presumably surfaces the `///` comments
// in this struct as schema descriptions, so their wording is user-facing --
// confirm before rewording them.
//
// `rename_all = "camelCase"` exposes the fields under keys such as
// `minTokens`, `minLines`, `skipLocal` and `crossLanguage`.
// Every field carries a serde default, so an empty object deserializes to the
// same values as `DuplicatesConfig::default()`.
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct DuplicatesConfig {
    /// Whether duplication detection is enabled.
    // Absent key -> `default_true()`, i.e. detection is on.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Detection mode: strict, mild, weak, or semantic.
    // Absent key -> `DetectionMode::default()`, which is `Mild`.
    #[serde(default)]
    pub mode: DetectionMode,

    /// Minimum number of tokens for a clone.
    // Absent key -> `default_min_tokens()` (50).
    #[serde(default = "default_min_tokens")]
    pub min_tokens: usize,

    /// Minimum number of lines for a clone.
    // Absent key -> `default_min_lines()` (5).
    #[serde(default = "default_min_lines")]
    pub min_lines: usize,

    /// Maximum allowed duplication percentage (0 = no limit).
    // Absent key -> `f64::default()` (0.0), which the doc above defines as "no limit".
    #[serde(default)]
    pub threshold: f64,

    /// Additional ignore patterns for duplication analysis.
    // Absent key -> empty list.
    #[serde(default)]
    pub ignore: Vec<String>,

    /// Only report cross-directory duplicates.
    // Absent key -> `false`.
    #[serde(default)]
    pub skip_local: bool,

    /// Enable cross-language clone detection by stripping type annotations.
    ///
    /// When enabled, TypeScript type annotations (parameter types, return types,
    /// generics, interfaces, type aliases) are stripped from the token stream,
    /// allowing detection of clones between `.ts` and `.js` files.
    // Absent key -> `false`.
    #[serde(default)]
    pub cross_language: bool,

    /// Fine-grained normalization overrides on top of the detection mode.
    // Absent key -> `NormalizationConfig::default()`, where every override is `None`.
    #[serde(default)]
    pub normalization: NormalizationConfig,
}
60
61impl Default for DuplicatesConfig {
62    fn default() -> Self {
63        Self {
64            enabled: true,
65            mode: DetectionMode::default(),
66            min_tokens: default_min_tokens(),
67            min_lines: default_min_lines(),
68            threshold: 0.0,
69            ignore: vec![],
70            skip_local: false,
71            cross_language: false,
72            normalization: NormalizationConfig::default(),
73        }
74    }
75}
76
/// Fine-grained normalization overrides.
///
/// Each option, when set to `Some(true)`, forces that normalization regardless of
/// the detection mode. When set to `Some(false)`, it forces preservation. When
/// `None`, the detection mode's default behavior applies.
// NOTE(review): the `JsonSchema` derive presumably surfaces the `///` comments
// in this struct as schema descriptions -- confirm before rewording them.
//
// The derived `Default` leaves every override as `None`, so a default value of
// this struct changes nothing about the mode's own behavior.
// `rename_all = "camelCase"` exposes the keys as `ignoreIdentifiers`,
// `ignoreStringValues` and `ignoreNumericValues`.
// `skip_serializing_if = "Option::is_none"` omits unset overrides from
// serialized output instead of writing them as null.
#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct NormalizationConfig {
    /// Blind all identifiers (variable names, function names, etc.) to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_identifiers: Option<bool>,

    /// Blind string literal values to the same hash.
    /// Default in `weak` and `semantic` modes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_string_values: Option<bool>,

    /// Blind numeric literal values to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_numeric_values: Option<bool>,
}
100
/// Resolved normalization flags: mode defaults merged with user overrides.
///
/// Unlike [`NormalizationConfig`], every flag here is a definite `bool`; values
/// are produced by [`ResolvedNormalization::resolve`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ResolvedNormalization {
    /// Whether identifiers are blinded (resolved form of
    /// `NormalizationConfig::ignore_identifiers`).
    pub ignore_identifiers: bool,
    /// Whether string literal values are blinded (resolved form of
    /// `NormalizationConfig::ignore_string_values`).
    pub ignore_string_values: bool,
    /// Whether numeric literal values are blinded (resolved form of
    /// `NormalizationConfig::ignore_numeric_values`).
    pub ignore_numeric_values: bool,
}
108
109impl ResolvedNormalization {
110    /// Resolve normalization from a detection mode and optional overrides.
111    #[must_use]
112    pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
113        let (default_ids, default_strings, default_numbers) = match mode {
114            DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
115            DetectionMode::Weak => (false, true, false),
116            DetectionMode::Semantic => (true, true, true),
117        };
118
119        Self {
120            ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
121            ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
122            ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
123        }
124    }
125}
126
/// Detection mode controlling how aggressively tokens are normalized.
///
/// Since fallow uses AST-based tokenization (not lexer-based), whitespace and
/// comments are inherently absent from the token stream. The `Strict` and `Mild`
/// modes are currently equivalent. `Weak` mode additionally blinds string
/// literals. `Semantic` mode blinds all identifiers and literal values for
/// Type-2 (renamed variable) clone detection.
// NOTE(review): the `JsonSchema` derive presumably surfaces the `///` comments
// on this enum as schema descriptions -- confirm before rewording them.
//
// `rename_all = "lowercase"` gives the serialized names `strict`, `mild`,
// `weak` and `semantic`, the same spellings the `Display` and `FromStr` impls
// in this file use.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "lowercase")]
pub enum DetectionMode {
    /// All tokens preserved including identifier names and literal values (Type-1 only).
    Strict,
    /// Default mode -- equivalent to strict for AST-based tokenization.
    // `#[default]` makes this the value of `DetectionMode::default()`.
    #[default]
    Mild,
    /// Blind string literal values (structure-preserving).
    Weak,
    /// Blind all identifiers and literal values for structural (Type-2) detection.
    Semantic,
}
147
148impl std::fmt::Display for DetectionMode {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        match self {
151            Self::Strict => write!(f, "strict"),
152            Self::Mild => write!(f, "mild"),
153            Self::Weak => write!(f, "weak"),
154            Self::Semantic => write!(f, "semantic"),
155        }
156    }
157}
158
159impl std::str::FromStr for DetectionMode {
160    type Err = String;
161
162    fn from_str(s: &str) -> Result<Self, Self::Err> {
163        match s.to_lowercase().as_str() {
164            "strict" => Ok(Self::Strict),
165            "mild" => Ok(Self::Mild),
166            "weak" => Ok(Self::Weak),
167            "semantic" => Ok(Self::Semantic),
168            other => Err(format!("unknown detection mode: '{other}'")),
169        }
170    }
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the expected resolution result from its three flags, in the order
    /// identifiers / string values / numeric values.
    fn flags(ids: bool, strings: bool, numbers: bool) -> ResolvedNormalization {
        ResolvedNormalization {
            ignore_identifiers: ids,
            ignore_string_values: strings,
            ignore_numeric_values: numbers,
        }
    }

    /// Resolve `mode` with no user overrides at all.
    fn resolve_plain(mode: DetectionMode) -> ResolvedNormalization {
        ResolvedNormalization::resolve(mode, &NormalizationConfig::default())
    }

    // ── DuplicatesConfig defaults ────────────────────────────────────

    #[test]
    fn duplicates_config_defaults() {
        let DuplicatesConfig {
            enabled,
            mode,
            min_tokens,
            min_lines,
            threshold,
            ignore,
            skip_local,
            cross_language,
            ..
        } = DuplicatesConfig::default();

        assert!(enabled);
        assert_eq!(mode, DetectionMode::Mild);
        assert_eq!(min_tokens, 50);
        assert_eq!(min_lines, 5);
        assert!(threshold.abs() < f64::EPSILON);
        assert!(ignore.is_empty());
        assert!(!skip_local);
        assert!(!cross_language);
    }

    // ── DetectionMode FromStr ────────────────────────────────────────

    #[test]
    fn detection_mode_from_str_all_variants() {
        let cases = [
            ("strict", DetectionMode::Strict),
            ("mild", DetectionMode::Mild),
            ("weak", DetectionMode::Weak),
            ("semantic", DetectionMode::Semantic),
        ];
        for (text, expected) in cases {
            assert_eq!(text.parse::<DetectionMode>().unwrap(), expected);
        }
    }

    #[test]
    fn detection_mode_from_str_case_insensitive() {
        let cases = [
            ("STRICT", DetectionMode::Strict),
            ("Weak", DetectionMode::Weak),
            ("SEMANTIC", DetectionMode::Semantic),
        ];
        for (text, expected) in cases {
            assert_eq!(text.parse::<DetectionMode>().unwrap(), expected);
        }
    }

    #[test]
    fn detection_mode_from_str_unknown() {
        let message = "foobar".parse::<DetectionMode>().unwrap_err();
        for needle in ["unknown detection mode", "foobar"] {
            assert!(message.contains(needle));
        }
    }

    // ── DetectionMode Display ────────────────────────────────────────

    #[test]
    fn detection_mode_display() {
        let cases = [
            (DetectionMode::Strict, "strict"),
            (DetectionMode::Mild, "mild"),
            (DetectionMode::Weak, "weak"),
            (DetectionMode::Semantic, "semantic"),
        ];
        for (mode, rendered) in cases {
            assert_eq!(mode.to_string(), rendered);
        }
    }

    // ── ResolvedNormalization::resolve ────────────────────────────────

    #[test]
    fn resolve_strict_mode_all_false() {
        assert_eq!(
            resolve_plain(DetectionMode::Strict),
            flags(false, false, false)
        );
    }

    #[test]
    fn resolve_mild_mode_all_false() {
        assert_eq!(
            resolve_plain(DetectionMode::Mild),
            flags(false, false, false)
        );
    }

    #[test]
    fn resolve_weak_mode_only_strings_true() {
        assert_eq!(resolve_plain(DetectionMode::Weak), flags(false, true, false));
    }

    #[test]
    fn resolve_semantic_mode_all_true() {
        assert_eq!(
            resolve_plain(DetectionMode::Semantic),
            flags(true, true, true)
        );
    }

    #[test]
    fn resolve_override_forces_true() {
        // Strict blinds nothing by itself; only the identifier override is set.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ..NormalizationConfig::default()
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Strict, &overrides),
            flags(true, false, false)
        );
    }

    #[test]
    fn resolve_override_forces_false() {
        // Semantic blinds everything by itself; two overrides switch flags back
        // off while the numeric flag keeps the mode's value.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(false),
            ignore_string_values: Some(false),
            ignore_numeric_values: None,
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Semantic, &overrides),
            flags(false, false, true)
        );
    }

    #[test]
    fn resolve_all_overrides_on_weak() {
        // Every flag is overridden; the string flag flips Weak's own `true` to `false`.
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: Some(false),
            ignore_numeric_values: Some(true),
        };
        assert_eq!(
            ResolvedNormalization::resolve(DetectionMode::Weak, &overrides),
            flags(true, false, true)
        );
    }

    // ── DuplicatesConfig deserialization ──────────────────────────────

    #[test]
    fn duplicates_config_json_all_fields() {
        let json = r#"{
            "enabled": false,
            "mode": "semantic",
            "minTokens": 100,
            "minLines": 10,
            "threshold": 5.0,
            "ignore": ["**/vendor/**"],
            "skipLocal": true,
            "crossLanguage": true
        }"#;
        let parsed: DuplicatesConfig = serde_json::from_str(json).unwrap();

        assert!(!parsed.enabled);
        assert_eq!(parsed.mode, DetectionMode::Semantic);
        assert_eq!(parsed.min_tokens, 100);
        assert_eq!(parsed.min_lines, 10);
        assert!((parsed.threshold - 5.0).abs() < f64::EPSILON);
        assert_eq!(parsed.ignore, ["**/vendor/**"]);
        assert!(parsed.skip_local);
        assert!(parsed.cross_language);
    }

    #[test]
    fn duplicates_config_json_partial_uses_defaults() {
        let json = r#"{"mode": "weak"}"#;
        let parsed: DuplicatesConfig = serde_json::from_str(json).unwrap();

        // Only `mode` was supplied; the rest must come from the defaults.
        assert_eq!(parsed.mode, DetectionMode::Weak);
        assert!(parsed.enabled);
        assert_eq!(parsed.min_tokens, 50);
        assert_eq!(parsed.min_lines, 5);
    }

    #[test]
    fn normalization_config_json_overrides() {
        let json = r#"{
            "ignoreIdentifiers": true,
            "ignoreStringValues": false
        }"#;
        let parsed: NormalizationConfig = serde_json::from_str(json).unwrap();

        assert_eq!(parsed.ignore_identifiers, Some(true));
        assert_eq!(parsed.ignore_string_values, Some(false));
        // Key absent from the JSON, so the override stays unset.
        assert_eq!(parsed.ignore_numeric_values, None);
    }
}
374}