dupe_core/
ignore_rules.rs

1//! Ignore management for marking acceptable duplicates.
2//!
3//! This module provides functionality to:
4//! - Load and save `.polydup-ignore` files
5//! - Compute content-based IDs for duplicates (SHA256 of normalized tokens)
6//! - Check if a duplicate should be ignored
7//! - Persist ignore decisions across file renames and refactors
8
9use chrono::{DateTime, Utc};
10use serde::{Deserialize, Serialize};
11use sha2::{Digest, Sha256};
12use std::collections::HashSet;
13use std::path::{Path, PathBuf};
14
15use crate::error::{PolyDupError, Result};
16
/// Version of the .polydup-ignore file format
///
/// Written into newly created ignore files (see `IgnoreFile::default`) and
/// used by `IgnoreManager::load` as the highest version it will accept.
const IGNORE_FILE_VERSION: u32 = 1;
19
/// Represents a range within a file (e.g., "src/main.rs:10-25")
///
/// The `Display` impl renders this `path:start-end` form and
/// `FileRange::parse` reads the same form back.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileRange {
    /// Path of the file that contains the range
    pub file: PathBuf,
    /// First line number of the range
    pub start_line: usize,
    /// Last line number of the range. `FileRange::parse` rejects input where
    /// this is smaller than `start_line`; direct construction is not checked.
    pub end_line: usize,
}
27
28impl std::fmt::Display for FileRange {
29    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
30        write!(
31            f,
32            "{}:{}-{}",
33            self.file.display(),
34            self.start_line,
35            self.end_line
36        )
37    }
38}
39
40impl FileRange {
41    /// Parse a file range from a string like "src/main.rs:10-25"
42    pub fn parse(s: &str) -> Result<Self> {
43        let parts: Vec<&str> = s.rsplitn(2, ':').collect();
44        if parts.len() != 2 {
45            return Err(PolyDupError::IgnoreRule(format!(
46                "Invalid file range format: {}",
47                s
48            )));
49        }
50
51        let file = PathBuf::from(parts[1]);
52        let range_parts: Vec<&str> = parts[0].split('-').collect();
53
54        if range_parts.len() != 2 {
55            return Err(PolyDupError::IgnoreRule(format!(
56                "Invalid line range format: {}",
57                s
58            )));
59        }
60
61        let start_line = range_parts[0]
62            .parse()
63            .map_err(|_| PolyDupError::IgnoreRule("Invalid start line number".to_string()))?;
64        let end_line = range_parts[1]
65            .parse()
66            .map_err(|_| PolyDupError::IgnoreRule("Invalid end line number".to_string()))?;
67
68        if start_line > end_line {
69            return Err(PolyDupError::IgnoreRule(format!(
70                "Start line ({}) must be <= end line ({})",
71                start_line, end_line
72            )));
73        }
74
75        Ok(FileRange {
76            file,
77            start_line,
78            end_line,
79        })
80    }
81}
82
/// A single ignore entry representing an acceptable duplicate
///
/// Entries are stored in the `ignores` list of an `IgnoreFile` and are looked
/// up by their `id`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct IgnoreEntry {
    /// Content-based ID (SHA256 of normalized token sequence)
    /// This ensures ignores persist across file renames and whitespace changes
    /// (IDs produced by `compute_duplicate_id` carry a `sha256:` prefix)
    pub id: String,

    /// Files and line ranges where this duplicate appears
    pub files: Vec<FileRange>,

    /// Human-readable reason for ignoring this duplicate
    pub reason: String,

    /// User who added this ignore (email or username)
    pub added_by: String,

    /// Timestamp when this ignore was added
    /// (`IgnoreEntry::new` fills this with the current UTC time)
    pub added_at: DateTime<Utc>,
}
102
103impl IgnoreEntry {
104    /// Create a new ignore entry with the current timestamp
105    pub fn new(id: String, files: Vec<FileRange>, reason: String, added_by: String) -> Self {
106        Self {
107            id,
108            files,
109            reason,
110            added_by,
111            added_at: Utc::now(),
112        }
113    }
114
115    /// Check if this ignore entry matches the given duplicate ID
116    pub fn matches_id(&self, duplicate_id: &str) -> bool {
117        self.id == duplicate_id
118    }
119}
120
/// Container for the .polydup-ignore file format
///
/// `IgnoreManager` reads and writes this structure as TOML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IgnoreFile {
    /// Format version of the file; `IgnoreManager::load` rejects values
    /// greater than `IGNORE_FILE_VERSION`
    pub version: u32,
    /// All ignore entries stored in the file
    pub ignores: Vec<IgnoreEntry>,
}
127
128impl Default for IgnoreFile {
129    fn default() -> Self {
130        Self {
131            version: IGNORE_FILE_VERSION,
132            ignores: Vec::new(),
133        }
134    }
135}
136
/// Manages loading, saving, and querying ignore entries
pub struct IgnoreManager {
    /// Location of the ignore file: `<directory>/.polydup-ignore`
    ignore_file_path: PathBuf,
    /// In-memory copy of the ignore file contents
    ignore_file: IgnoreFile,
    /// IDs of the entries in `ignore_file`, kept alongside the list so that
    /// `is_ignored` is a set lookup rather than a scan
    ignored_ids: HashSet<String>,
}
143
144impl IgnoreManager {
145    /// Create a new IgnoreManager for the given directory
146    pub fn new(directory: &Path) -> Self {
147        let ignore_file_path = directory.join(".polydup-ignore");
148        Self {
149            ignore_file_path,
150            ignore_file: IgnoreFile::default(),
151            ignored_ids: HashSet::new(),
152        }
153    }
154
155    /// Load the .polydup-ignore file if it exists
156    pub fn load(&mut self) -> Result<()> {
157        if !self.ignore_file_path.exists() {
158            // No ignore file is not an error - just means nothing is ignored
159            return Ok(());
160        }
161
162        let contents = std::fs::read_to_string(&self.ignore_file_path).map_err(PolyDupError::Io)?;
163
164        self.ignore_file = toml::from_str(&contents).map_err(|e| {
165            PolyDupError::Parsing(format!("Failed to parse .polydup-ignore file: {}", e))
166        })?;
167
168        // Validate version
169        if self.ignore_file.version > IGNORE_FILE_VERSION {
170            return Err(PolyDupError::Config(format!(
171                "Unsupported .polydup-ignore version: {} (expected <= {})",
172                self.ignore_file.version, IGNORE_FILE_VERSION
173            )));
174        }
175
176        // Build lookup set for fast ID checks
177        self.ignored_ids = self
178            .ignore_file
179            .ignores
180            .iter()
181            .map(|entry| entry.id.clone())
182            .collect();
183
184        Ok(())
185    }
186
187    /// Save the current ignore entries to .polydup-ignore
188    pub fn save(&self) -> Result<()> {
189        let contents = toml::to_string_pretty(&self.ignore_file).map_err(|e| {
190            PolyDupError::Parsing(format!("Failed to serialize ignore file: {}", e))
191        })?;
192
193        std::fs::write(&self.ignore_file_path, contents).map_err(PolyDupError::Io)?;
194
195        Ok(())
196    }
197
198    /// Check if a duplicate with the given ID should be ignored
199    pub fn is_ignored(&self, duplicate_id: &str) -> bool {
200        self.ignored_ids.contains(duplicate_id)
201    }
202
203    /// Add a new ignore entry
204    pub fn add_ignore(&mut self, entry: IgnoreEntry) {
205        self.ignored_ids.insert(entry.id.clone());
206        self.ignore_file.ignores.push(entry);
207    }
208
209    /// Remove an ignore entry by ID
210    pub fn remove_ignore(&mut self, duplicate_id: &str) -> bool {
211        if let Some(pos) = self
212            .ignore_file
213            .ignores
214            .iter()
215            .position(|e| e.id == duplicate_id)
216        {
217            self.ignore_file.ignores.remove(pos);
218            self.ignored_ids.remove(duplicate_id);
219            true
220        } else {
221            false
222        }
223    }
224
225    /// Get all ignore entries
226    pub fn list_ignores(&self) -> &[IgnoreEntry] {
227        &self.ignore_file.ignores
228    }
229
230    /// Get the number of ignored duplicates
231    pub fn count(&self) -> usize {
232        self.ignore_file.ignores.len()
233    }
234}
235
236/// Compute a content-based ID for a duplicate
237///
238/// This uses SHA256 of the normalized token sequence, ensuring:
239/// - Ignores survive file renames
240/// - Ignores survive whitespace/comment changes
241/// - Two identical code blocks get the same ID
242pub fn compute_duplicate_id(normalized_tokens: &[String]) -> String {
243    let mut hasher = Sha256::new();
244
245    // Hash the concatenated normalized tokens
246    for token in normalized_tokens {
247        hasher.update(token.as_bytes());
248        hasher.update(b"\n"); // Separator to avoid collisions
249    }
250
251    let result = hasher.finalize();
252    format!("sha256:{}", hex::encode(result))
253}
254
255/// Compute a symmetric ID for a pair of token windows.
256///
257/// When both sides normalize to the same token sequence (Type-1/2 clones),
258/// this returns the legacy single-window ID to keep existing ignore files valid.
259pub fn compute_symmetric_duplicate_id(
260    normalized_tokens1: &[String],
261    normalized_tokens2: &[String],
262) -> String {
263    let id1 = compute_duplicate_id(normalized_tokens1);
264    let id2 = compute_duplicate_id(normalized_tokens2);
265
266    // Preserve legacy IDs when both windows hash the same (or collide)
267    if id1 == id2 {
268        return id1;
269    }
270
271    let (first, second) = if id1 <= id2 { (id1, id2) } else { (id2, id1) };
272
273    let mut hasher = Sha256::new();
274    hasher.update(first.as_bytes());
275    hasher.update(b"\n");
276    hasher.update(second.as_bytes());
277
278    let result = hasher.finalize();
279    format!("sha256:{}", hex::encode(result))
280}
281
#[cfg(test)]
mod tests {
    use super::*;

    /// Turn string literals into the owned token list the ID functions take.
    fn toks(words: &[&str]) -> Vec<String> {
        words.iter().map(|w| w.to_string()).collect()
    }

    /// Entry with fixed metadata, used by the manager tests.
    fn sample_entry(id: &str) -> IgnoreEntry {
        IgnoreEntry::new(
            id.to_string(),
            vec![],
            "Test".to_string(),
            "test@example.com".to_string(),
        )
    }

    #[test]
    fn test_file_range_parse() {
        let parsed = FileRange::parse("src/main.rs:10-25").unwrap();

        assert_eq!(parsed.file, PathBuf::from("src/main.rs"));
        assert_eq!(parsed.start_line, 10);
        assert_eq!(parsed.end_line, 25);
    }

    #[test]
    fn test_file_range_parse_invalid() {
        // No separator at all
        assert!(FileRange::parse("invalid").is_err());
        // Path without a line range
        assert!(FileRange::parse("src/main.rs").is_err());
        // Single line number instead of a range
        assert!(FileRange::parse("src/main.rs:10").is_err());
        // start > end
        assert!(FileRange::parse("src/main.rs:25-10").is_err());
    }

    #[test]
    fn test_file_range_display() {
        let rendered = FileRange {
            file: PathBuf::from("src/lib.rs"),
            start_line: 5,
            end_line: 15,
        }
        .to_string();

        assert_eq!(rendered, "src/lib.rs:5-15");
    }

    #[test]
    fn test_compute_duplicate_id() {
        let first = compute_duplicate_id(&toks(&["fn", "$$ID", "$$NUM"]));
        let same_again = compute_duplicate_id(&toks(&["fn", "$$ID", "$$NUM"]));
        let different = compute_duplicate_id(&toks(&["fn", "$$ID", "$$STR"]));

        assert_eq!(first, same_again, "Same tokens should produce same ID");
        assert_ne!(
            first, different,
            "Different tokens should produce different IDs"
        );
        assert!(first.starts_with("sha256:"), "ID should have sha256 prefix");
    }

    #[test]
    fn test_compute_duplicate_id_symmetric_same_tokens() {
        let window = toks(&["a", "b"]);

        let single = compute_duplicate_id(&window);
        let symmetric = compute_symmetric_duplicate_id(&window, &window);

        assert_eq!(
            symmetric, single,
            "Symmetric ID should match legacy ID when windows are identical"
        );
    }

    #[test]
    fn test_compute_duplicate_id_symmetric_order_independent() {
        let shorter = toks(&["a", "b", "c"]);
        let longer = toks(&["a", "b", "c", "d"]);

        let forward = compute_symmetric_duplicate_id(&shorter, &longer);
        let backward = compute_symmetric_duplicate_id(&longer, &shorter);

        assert_eq!(forward, backward, "Symmetric ID should ignore argument order");
        assert_ne!(
            forward,
            compute_duplicate_id(&shorter),
            "Should incorporate both windows when they differ"
        );
    }

    #[test]
    fn test_ignore_entry_creation() {
        let ranges = vec![FileRange {
            file: PathBuf::from("src/main.rs"),
            start_line: 1,
            end_line: 10,
        }];

        let entry = IgnoreEntry::new(
            "sha256:abc123".to_string(),
            ranges.clone(),
            "License header".to_string(),
            "user@example.com".to_string(),
        );

        assert_eq!(entry.id, "sha256:abc123");
        assert_eq!(entry.files, ranges);
        assert_eq!(entry.reason, "License header");
        assert_eq!(entry.added_by, "user@example.com");
    }

    #[test]
    fn test_ignore_manager_basic() {
        // Only in-memory operations are exercised; nothing is written to disk.
        let dir = std::env::temp_dir();
        let mut manager = IgnoreManager::new(&dir);

        // Initially no ignores
        assert_eq!(manager.count(), 0);
        assert!(!manager.is_ignored("sha256:test"));

        // Add an ignore
        manager.add_ignore(sample_entry("sha256:test"));

        assert_eq!(manager.count(), 1);
        assert!(manager.is_ignored("sha256:test"));
        assert!(!manager.is_ignored("sha256:other"));

        // Remove the ignore
        assert!(manager.remove_ignore("sha256:test"));
        assert_eq!(manager.count(), 0);
        assert!(!manager.is_ignored("sha256:test"));
    }

    #[test]
    fn test_ignore_manager_remove_nonexistent() {
        let dir = std::env::temp_dir();
        let mut manager = IgnoreManager::new(&dir);

        assert!(!manager.remove_ignore("sha256:nonexistent"));
    }
}
417}