Skip to main content

fallow_config/config/
duplicates_config.rs

1use schemars::JsonSchema;
2use serde::{Deserialize, Serialize};
3
/// Serde default helper for boolean options that are on unless explicitly
/// turned off; always returns `true`.
const fn default_true() -> bool {
    true
}
7
/// Serde default helper: the minimum clone size, in tokens, used when the
/// `min_tokens` option is not supplied.
const fn default_min_tokens() -> usize {
    const DEFAULT_MIN_TOKENS: usize = 50;
    DEFAULT_MIN_TOKENS
}
11
/// Serde default helper: the minimum clone size, in lines, used when the
/// `min_lines` option is not supplied.
const fn default_min_lines() -> usize {
    const DEFAULT_MIN_LINES: usize = 5;
    DEFAULT_MIN_LINES
}
15
/// Configuration for code duplication detection.
// Maintainer notes (plain `//` so they stay out of rustdoc):
// - `rename_all = "camelCase"` means the multi-word fields are read and written
//   as `minTokens`, `minLines`, `skipLocal` and `crossLanguage`.
// - Every field carries a serde default, so any key may be omitted on input.
// - NOTE(review): the `JsonSchema` derive is expected to reuse the `///` text
//   below as schema descriptions -- confirm before rewording those comments.
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct DuplicatesConfig {
    /// Whether duplication detection is enabled.
    // Omitted key -> `default_true()`, i.e. detection is on unless disabled.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Detection mode: strict, mild, weak, or semantic.
    // Omitted key -> `DetectionMode::default()` (the variant marked `#[default]`).
    #[serde(default)]
    pub mode: DetectionMode,

    /// Minimum number of tokens for a clone.
    // Omitted key -> `default_min_tokens()`.
    #[serde(default = "default_min_tokens")]
    pub min_tokens: usize,

    /// Minimum number of lines for a clone.
    // Omitted key -> `default_min_lines()`.
    #[serde(default = "default_min_lines")]
    pub min_lines: usize,

    /// Maximum allowed duplication percentage (0 = no limit).
    // Omitted key -> `f64::default()`, i.e. 0.0, which the doc above defines as
    // "no limit".
    #[serde(default)]
    pub threshold: f64,

    /// Additional ignore patterns for duplication analysis.
    // Omitted key -> empty list. The pattern syntax is interpreted elsewhere.
    #[serde(default)]
    pub ignore: Vec<String>,

    /// Only report cross-directory duplicates.
    // Omitted key -> `false`.
    #[serde(default)]
    pub skip_local: bool,

    /// Enable cross-language clone detection by stripping type annotations.
    ///
    /// When enabled, TypeScript type annotations (parameter types, return types,
    /// generics, interfaces, type aliases) are stripped from the token stream,
    /// allowing detection of clones between `.ts` and `.js` files.
    // Omitted key -> `false`.
    #[serde(default)]
    pub cross_language: bool,

    /// Fine-grained normalization overrides on top of the detection mode.
    // Omitted key -> `NormalizationConfig::default()` (every override `None`).
    #[serde(default)]
    pub normalization: NormalizationConfig,
}
60
61impl Default for DuplicatesConfig {
62    fn default() -> Self {
63        Self {
64            enabled: true,
65            mode: DetectionMode::default(),
66            min_tokens: default_min_tokens(),
67            min_lines: default_min_lines(),
68            threshold: 0.0,
69            ignore: vec![],
70            skip_local: false,
71            cross_language: false,
72            normalization: NormalizationConfig::default(),
73        }
74    }
75}
76
/// Fine-grained normalization overrides.
///
/// Each option, when set to `Some(true)`, forces that normalization regardless of
/// the detection mode. When set to `Some(false)`, it forces preservation. When
/// `None`, the detection mode's default behavior applies.
// Maintainer notes (plain `//` so they stay out of rustdoc):
// - `rename_all = "camelCase"` maps the fields to `ignoreIdentifiers`,
//   `ignoreStringValues` and `ignoreNumericValues`.
// - `skip_serializing_if = "Option::is_none"` leaves unset overrides out of the
//   serialized output; `default` lets them be omitted on input (-> `None`).
// - The derived `Default` therefore means "no overrides at all".
// - NOTE(review): the `JsonSchema` derive is expected to reuse the `///` text
//   as schema descriptions -- confirm before rewording those comments.
#[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct NormalizationConfig {
    /// Blind all identifiers (variable names, function names, etc.) to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_identifiers: Option<bool>,

    /// Blind string literal values to the same hash.
    /// Default in `weak` and `semantic` modes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_string_values: Option<bool>,

    /// Blind numeric literal values to the same hash.
    /// Default in `semantic` mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ignore_numeric_values: Option<bool>,
}
100
/// Resolved normalization flags: mode defaults merged with user overrides.
///
/// Unlike [`NormalizationConfig`], every flag here is a plain `bool`: the
/// "unset" state has already been replaced by the detection mode's default.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ResolvedNormalization {
    /// Final decision on blinding identifiers.
    pub ignore_identifiers: bool,
    /// Final decision on blinding string literal values.
    pub ignore_string_values: bool,
    /// Final decision on blinding numeric literal values.
    pub ignore_numeric_values: bool,
}
108
109impl ResolvedNormalization {
110    /// Resolve normalization from a detection mode and optional overrides.
111    pub fn resolve(mode: DetectionMode, overrides: &NormalizationConfig) -> Self {
112        let (default_ids, default_strings, default_numbers) = match mode {
113            DetectionMode::Strict | DetectionMode::Mild => (false, false, false),
114            DetectionMode::Weak => (false, true, false),
115            DetectionMode::Semantic => (true, true, true),
116        };
117
118        Self {
119            ignore_identifiers: overrides.ignore_identifiers.unwrap_or(default_ids),
120            ignore_string_values: overrides.ignore_string_values.unwrap_or(default_strings),
121            ignore_numeric_values: overrides.ignore_numeric_values.unwrap_or(default_numbers),
122        }
123    }
124}
125
/// Detection mode controlling how aggressively tokens are normalized.
///
/// Since fallow uses AST-based tokenization (not lexer-based), whitespace and
/// comments are inherently absent from the token stream. The `Strict` and `Mild`
/// modes are currently equivalent. `Weak` mode additionally blinds string
/// literals. `Semantic` mode blinds all identifiers and literal values for
/// Type-2 (renamed variable) clone detection.
// Maintainer notes (plain `//` so they stay out of rustdoc):
// - `rename_all = "lowercase"` makes the serialized values `strict`, `mild`,
//   `weak` and `semantic`; keep any hand-written string conversions for this
//   enum in sync with those spellings when adding a variant.
// - NOTE(review): the `JsonSchema` derive is expected to reuse the `///` text
//   as schema descriptions -- confirm before rewording those comments.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "lowercase")]
pub enum DetectionMode {
    /// All tokens preserved including identifier names and literal values (Type-1 only).
    Strict,
    /// Default mode -- equivalent to strict for AST-based tokenization.
    // `#[default]` makes this variant the result of `DetectionMode::default()`.
    #[default]
    Mild,
    /// Blind string literal values (structure-preserving).
    Weak,
    /// Blind all identifiers and literal values for structural (Type-2) detection.
    Semantic,
}
146
147impl std::fmt::Display for DetectionMode {
148    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
149        match self {
150            Self::Strict => write!(f, "strict"),
151            Self::Mild => write!(f, "mild"),
152            Self::Weak => write!(f, "weak"),
153            Self::Semantic => write!(f, "semantic"),
154        }
155    }
156}
157
158impl std::str::FromStr for DetectionMode {
159    type Err = String;
160
161    fn from_str(s: &str) -> Result<Self, Self::Err> {
162        match s.to_lowercase().as_str() {
163            "strict" => Ok(Self::Strict),
164            "mild" => Ok(Self::Mild),
165            "weak" => Ok(Self::Weak),
166            "semantic" => Ok(Self::Semantic),
167            other => Err(format!("unknown detection mode: '{other}'")),
168        }
169    }
170}