Skip to main content

dupe_core/
ignore_rules.rs

1//! Ignore management for marking acceptable duplicates.
2//!
3//! This module provides functionality to:
4//! - Load and save `.polydup-ignore` files
5//! - Compute content-based IDs for duplicates (SHA256 of normalized tokens)
6//! - Check if a duplicate should be ignored
7//! - Persist ignore decisions across file renames and refactors
8
9use chrono::{DateTime, Utc};
10use serde::{Deserialize, Serialize};
11use sha2::{Digest, Sha256};
12use std::collections::HashSet;
13use std::path::{Path, PathBuf};
14
15use crate::error::{PolyDupError, Result};
16
/// Version of the .polydup-ignore file format
///
/// New `IgnoreFile` values are created with this version, and
/// `IgnoreManager::load` rejects files whose `version` is greater than it.
const IGNORE_FILE_VERSION: u32 = 1;
19
/// Represents a range within a file (e.g., "src/main.rs:10-25")
///
/// The `Display` impl renders the value back as `file:start_line-end_line`.
/// `FileRange::parse` only produces values with `start_line <= end_line`;
/// the struct itself does not enforce that when built directly.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileRange {
    /// Path of the file the range refers to.
    pub file: PathBuf,
    /// First line of the range.
    pub start_line: usize,
    /// Last line of the range.
    // NOTE(review): presumably inclusive and 1-based — confirm against callers.
    pub end_line: usize,
}
27
28impl std::fmt::Display for FileRange {
29    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
30        write!(
31            f,
32            "{}:{}-{}",
33            self.file.display(),
34            self.start_line,
35            self.end_line
36        )
37    }
38}
39
40impl FileRange {
41    /// Parse a file range from a string like "src/main.rs:10-25"
42    pub fn parse(s: &str) -> Result<Self> {
43        let parts: Vec<&str> = s.rsplitn(2, ':').collect();
44        if parts.len() != 2 {
45            return Err(PolyDupError::IgnoreRule(format!(
46                "Invalid file range format: {}",
47                s
48            )));
49        }
50
51        let file = PathBuf::from(parts[1]);
52        let range_parts: Vec<&str> = parts[0].split('-').collect();
53
54        if range_parts.len() != 2 {
55            return Err(PolyDupError::IgnoreRule(format!(
56                "Invalid line range format: {}",
57                s
58            )));
59        }
60
61        let start_line = range_parts[0]
62            .parse()
63            .map_err(|_| PolyDupError::IgnoreRule("Invalid start line number".to_string()))?;
64        let end_line = range_parts[1]
65            .parse()
66            .map_err(|_| PolyDupError::IgnoreRule("Invalid end line number".to_string()))?;
67
68        if start_line > end_line {
69            return Err(PolyDupError::IgnoreRule(format!(
70                "Start line ({}) must be <= end line ({})",
71                start_line, end_line
72            )));
73        }
74
75        Ok(FileRange {
76            file,
77            start_line,
78            end_line,
79        })
80    }
81}
82
/// A single ignore entry representing an acceptable duplicate
///
/// Entries are stored in `IgnoreFile::ignores` and are serialized and
/// deserialized together with that file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct IgnoreEntry {
    /// Content-based ID (SHA256 of normalized token sequence)
    /// This ensures ignores persist across file renames and whitespace changes
    pub id: String,

    /// Files and line ranges where this duplicate appears
    pub files: Vec<FileRange>,

    /// Human-readable reason for ignoring this duplicate
    pub reason: String,

    /// User who added this ignore (email or username)
    pub added_by: String,

    /// Timestamp when this ignore was added
    /// (`IgnoreEntry::new` fills this in with `Utc::now()`)
    pub added_at: DateTime<Utc>,
}
102
103impl IgnoreEntry {
104    /// Create a new ignore entry with the current timestamp
105    pub fn new(id: String, files: Vec<FileRange>, reason: String, added_by: String) -> Self {
106        Self {
107            id,
108            files,
109            reason,
110            added_by,
111            added_at: Utc::now(),
112        }
113    }
114
115    /// Check if this ignore entry matches the given duplicate ID
116    pub fn matches_id(&self, duplicate_id: &str) -> bool {
117        self.id == duplicate_id
118    }
119}
120
/// Container for the .polydup-ignore file format
///
/// `IgnoreManager` reads and writes this structure as TOML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IgnoreFile {
    /// Format version of the file; `IgnoreManager::load` rejects values
    /// greater than `IGNORE_FILE_VERSION`.
    pub version: u32,
    /// The recorded ignore entries, in file order.
    pub ignores: Vec<IgnoreEntry>,
}
127
128impl Default for IgnoreFile {
129    fn default() -> Self {
130        Self {
131            version: IGNORE_FILE_VERSION,
132            ignores: Vec::new(),
133        }
134    }
135}
136
137/// Manages loading, saving, and querying ignore entries
138pub struct IgnoreManager {
139    ignore_file_path: PathBuf,
140    ignore_file: IgnoreFile,
141    ignored_ids: HashSet<String>,
142}
143
144impl IgnoreManager {
145    /// Create a new IgnoreManager for the given directory
146    pub fn new(directory: &Path) -> Self {
147        let ignore_file_path = directory.join(".polydup-ignore");
148        Self {
149            ignore_file_path,
150            ignore_file: IgnoreFile::default(),
151            ignored_ids: HashSet::new(),
152        }
153    }
154
155    /// Load the .polydup-ignore file if it exists
156    pub fn load(&mut self) -> Result<()> {
157        if !self.ignore_file_path.exists() {
158            // No ignore file is not an error - just means nothing is ignored
159            return Ok(());
160        }
161
162        let contents = std::fs::read_to_string(&self.ignore_file_path).map_err(PolyDupError::Io)?;
163
164        self.ignore_file = toml::from_str(&contents).map_err(|e| {
165            PolyDupError::Parsing(format!("Failed to parse .polydup-ignore file: {}", e))
166        })?;
167
168        // Validate version
169        if self.ignore_file.version > IGNORE_FILE_VERSION {
170            return Err(PolyDupError::Config(format!(
171                "Unsupported .polydup-ignore version: {} (expected <= {})",
172                self.ignore_file.version, IGNORE_FILE_VERSION
173            )));
174        }
175
176        // Build lookup set for fast ID checks
177        self.ignored_ids = self
178            .ignore_file
179            .ignores
180            .iter()
181            .map(|entry| entry.id.clone())
182            .collect();
183
184        Ok(())
185    }
186
187    /// Save the current ignore entries to .polydup-ignore
188    pub fn save(&self) -> Result<()> {
189        let contents = toml::to_string_pretty(&self.ignore_file).map_err(|e| {
190            PolyDupError::Parsing(format!("Failed to serialize ignore file: {}", e))
191        })?;
192
193        std::fs::write(&self.ignore_file_path, contents).map_err(PolyDupError::Io)?;
194
195        Ok(())
196    }
197
198    /// Check if a duplicate with the given ID should be ignored
199    pub fn is_ignored(&self, duplicate_id: &str) -> bool {
200        self.ignored_ids.contains(duplicate_id)
201    }
202
203    /// Add a new ignore entry
204    pub fn add_ignore(&mut self, entry: IgnoreEntry) {
205        self.ignored_ids.insert(entry.id.clone());
206        self.ignore_file.ignores.push(entry);
207    }
208
209    /// Remove an ignore entry by ID (supports prefix matching for short IDs)
210    pub fn remove_ignore(&mut self, duplicate_id: &str) -> bool {
211        // First try exact match
212        if let Some(pos) = self
213            .ignore_file
214            .ignores
215            .iter()
216            .position(|e| e.id == duplicate_id)
217        {
218            self.ignore_file.ignores.remove(pos);
219            self.ignored_ids.remove(duplicate_id);
220            return true;
221        }
222
223        // Try prefix match for short IDs
224        if let Some(full_id) = self.find_unique_by_prefix(duplicate_id) {
225            if let Some(pos) = self
226                .ignore_file
227                .ignores
228                .iter()
229                .position(|e| e.id == full_id)
230            {
231                self.ignore_file.ignores.remove(pos);
232                self.ignored_ids.remove(&full_id);
233                return true;
234            }
235        }
236
237        false
238    }
239
240    /// Find a unique ID by prefix (for short ID matching)
241    ///
242    /// Returns:
243    /// - `Some(full_id)` if exactly one ID matches the prefix
244    /// - `None` if no IDs match or multiple IDs match (ambiguous)
245    pub fn find_unique_by_prefix(&self, prefix: &str) -> Option<String> {
246        let matches: Vec<&String> = self
247            .ignored_ids
248            .iter()
249            .filter(|id| id.contains(prefix) || id.ends_with(prefix))
250            .collect();
251
252        if matches.len() == 1 {
253            Some(matches[0].clone())
254        } else {
255            None
256        }
257    }
258
259    /// Find all IDs matching a prefix (for displaying ambiguous matches)
260    pub fn find_all_by_prefix(&self, prefix: &str) -> Vec<String> {
261        self.ignored_ids
262            .iter()
263            .filter(|id| id.contains(prefix) || id.ends_with(prefix))
264            .cloned()
265            .collect()
266    }
267
268    /// Get all ignore entries
269    pub fn list_ignores(&self) -> &[IgnoreEntry] {
270        &self.ignore_file.ignores
271    }
272
273    /// Get the number of ignored duplicates
274    pub fn count(&self) -> usize {
275        self.ignore_file.ignores.len()
276    }
277}
278
279/// Compute a content-based ID for a duplicate
280///
281/// This uses SHA256 of the normalized token sequence, ensuring:
282/// - Ignores survive file renames
283/// - Ignores survive whitespace/comment changes
284/// - Two identical code blocks get the same ID
285pub fn compute_duplicate_id(normalized_tokens: &[String]) -> String {
286    let mut hasher = Sha256::new();
287
288    // Hash the concatenated normalized tokens
289    for token in normalized_tokens {
290        hasher.update(token.as_bytes());
291        hasher.update(b"\n"); // Separator to avoid collisions
292    }
293
294    let result = hasher.finalize();
295    format!("sha256:{}", hex::encode(result))
296}
297
298/// Compute a symmetric ID for a pair of token windows.
299///
300/// When both sides normalize to the same token sequence (Type-1/2 clones),
301/// this returns the legacy single-window ID to keep existing ignore files valid.
302pub fn compute_symmetric_duplicate_id(
303    normalized_tokens1: &[String],
304    normalized_tokens2: &[String],
305) -> String {
306    let id1 = compute_duplicate_id(normalized_tokens1);
307    let id2 = compute_duplicate_id(normalized_tokens2);
308
309    // Preserve legacy IDs when both windows hash the same (or collide)
310    if id1 == id2 {
311        return id1;
312    }
313
314    let (first, second) = if id1 <= id2 { (id1, id2) } else { (id2, id1) };
315
316    let mut hasher = Sha256::new();
317    hasher.update(first.as_bytes());
318    hasher.update(b"\n");
319    hasher.update(second.as_bytes());
320
321    let result = hasher.finalize();
322    format!("sha256:{}", hex::encode(result))
323}
324
// Unit tests for range parsing/formatting, ID computation, and the in-memory
// behavior of `IgnoreManager`. None of them call `load` or `save`, so no
// `.polydup-ignore` file is read or written.
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed `path:start-end` string parses into its three parts.
    #[test]
    fn test_file_range_parse() {
        let range = FileRange::parse("src/main.rs:10-25").unwrap();
        assert_eq!(range.file, PathBuf::from("src/main.rs"));
        assert_eq!(range.start_line, 10);
        assert_eq!(range.end_line, 25);
    }

    /// Inputs missing the `:` or the `-`, or with a reversed range, are rejected.
    #[test]
    fn test_file_range_parse_invalid() {
        assert!(FileRange::parse("invalid").is_err());
        assert!(FileRange::parse("src/main.rs").is_err());
        assert!(FileRange::parse("src/main.rs:10").is_err());
        assert!(FileRange::parse("src/main.rs:25-10").is_err()); // start > end
    }

    /// `Display` renders a range in the same `path:start-end` form that `parse` accepts.
    #[test]
    fn test_file_range_display() {
        let range = FileRange {
            file: PathBuf::from("src/lib.rs"),
            start_line: 5,
            end_line: 15,
        };
        assert_eq!(range.to_string(), "src/lib.rs:5-15");
    }

    /// Equal token sequences hash to the same ID, different ones do not,
    /// and every ID carries the `sha256:` prefix.
    #[test]
    fn test_compute_duplicate_id() {
        let tokens1 = vec!["fn".to_string(), "$$ID".to_string(), "$$NUM".to_string()];
        let tokens2 = vec!["fn".to_string(), "$$ID".to_string(), "$$NUM".to_string()];
        let tokens3 = vec!["fn".to_string(), "$$ID".to_string(), "$$STR".to_string()];

        let id1 = compute_duplicate_id(&tokens1);
        let id2 = compute_duplicate_id(&tokens2);
        let id3 = compute_duplicate_id(&tokens3);

        assert_eq!(id1, id2, "Same tokens should produce same ID");
        assert_ne!(id1, id3, "Different tokens should produce different IDs");
        assert!(id1.starts_with("sha256:"), "ID should have sha256 prefix");
    }

    /// With identical windows the symmetric ID equals the single-window ID.
    #[test]
    fn test_compute_duplicate_id_symmetric_same_tokens() {
        let tokens = vec!["a".to_string(), "b".to_string()];

        let symmetric = compute_symmetric_duplicate_id(&tokens, &tokens);
        let single = compute_duplicate_id(&tokens);

        assert_eq!(
            symmetric, single,
            "Symmetric ID should match legacy ID when windows are identical"
        );
    }

    /// With different windows the symmetric ID is the same for either
    /// argument order and differs from the ID of one window alone.
    #[test]
    fn test_compute_duplicate_id_symmetric_order_independent() {
        let tokens_a = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let tokens_b = vec![
            "a".to_string(),
            "b".to_string(),
            "c".to_string(),
            "d".to_string(),
        ];

        let id1 = compute_symmetric_duplicate_id(&tokens_a, &tokens_b);
        let id2 = compute_symmetric_duplicate_id(&tokens_b, &tokens_a);

        assert_eq!(id1, id2, "Symmetric ID should ignore argument order");
        assert_ne!(
            id1,
            compute_duplicate_id(&tokens_a),
            "Should incorporate both windows when they differ"
        );
    }

    /// `IgnoreEntry::new` stores the supplied fields unchanged.
    /// The generated `added_at` timestamp is not checked.
    #[test]
    fn test_ignore_entry_creation() {
        let files = vec![FileRange {
            file: PathBuf::from("src/main.rs"),
            start_line: 1,
            end_line: 10,
        }];

        let entry = IgnoreEntry::new(
            "sha256:abc123".to_string(),
            files.clone(),
            "License header".to_string(),
            "user@example.com".to_string(),
        );

        assert_eq!(entry.id, "sha256:abc123");
        assert_eq!(entry.files, files);
        assert_eq!(entry.reason, "License header");
        assert_eq!(entry.added_by, "user@example.com");
    }

    /// Add → query → remove round trip on the in-memory state only.
    #[test]
    fn test_ignore_manager_basic() {
        // The directory only determines the ignore-file path; nothing is read or written.
        let temp_dir = std::env::temp_dir();
        let mut manager = IgnoreManager::new(&temp_dir);

        // Initially no ignores
        assert_eq!(manager.count(), 0);
        assert!(!manager.is_ignored("sha256:test"));

        // Add an ignore
        let entry = IgnoreEntry::new(
            "sha256:test".to_string(),
            vec![],
            "Test".to_string(),
            "test@example.com".to_string(),
        );
        manager.add_ignore(entry);

        assert_eq!(manager.count(), 1);
        assert!(manager.is_ignored("sha256:test"));
        assert!(!manager.is_ignored("sha256:other"));

        // Remove the ignore
        assert!(manager.remove_ignore("sha256:test"));
        assert_eq!(manager.count(), 0);
        assert!(!manager.is_ignored("sha256:test"));
    }

    /// Removing an ID from an empty manager reports that nothing was removed.
    #[test]
    fn test_ignore_manager_remove_nonexistent() {
        let temp_dir = std::env::temp_dir();
        let mut manager = IgnoreManager::new(&temp_dir);

        assert!(!manager.remove_ignore("sha256:nonexistent"));
    }
}