dupe_core/
ignore_rules.rs

1//! Ignore management for marking acceptable duplicates.
2//!
3//! This module provides functionality to:
4//! - Load and save `.polydup-ignore` files
5//! - Compute content-based IDs for duplicates (SHA256 of normalized tokens)
6//! - Check if a duplicate should be ignored
7//! - Persist ignore decisions across file renames and refactors
8
9use anyhow::{Context, Result};
10use chrono::{DateTime, Utc};
11use serde::{Deserialize, Serialize};
12use sha2::{Digest, Sha256};
13use std::collections::HashSet;
14use std::path::{Path, PathBuf};
15
/// Version of the `.polydup-ignore` file format.
///
/// `IgnoreFile::default` stamps new files with this value, and
/// `IgnoreManager::load` rejects any file whose `version` is greater than it.
const IGNORE_FILE_VERSION: u32 = 1;
18
/// Represents a range within a file (e.g., "src/main.rs:10-25")
///
/// The `Display` implementation renders `<file>:<start_line>-<end_line>`,
/// which is the same shape that `FileRange::parse` accepts.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileRange {
    /// Path of the file that contains the range.
    pub file: PathBuf,
    /// First line of the range. `FileRange::parse` guarantees
    /// `start_line <= end_line`; directly constructed values are not checked.
    pub start_line: usize,
    /// Last line of the range.
    // NOTE(review): presumably inclusive and 1-based like the examples — confirm
    // against the code that produces these ranges.
    pub end_line: usize,
}
26
27impl std::fmt::Display for FileRange {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        write!(
30            f,
31            "{}:{}-{}",
32            self.file.display(),
33            self.start_line,
34            self.end_line
35        )
36    }
37}
38
39impl FileRange {
40    /// Parse a file range from a string like "src/main.rs:10-25"
41    pub fn parse(s: &str) -> Result<Self> {
42        let parts: Vec<&str> = s.rsplitn(2, ':').collect();
43        if parts.len() != 2 {
44            anyhow::bail!("Invalid file range format: {}", s);
45        }
46
47        let file = PathBuf::from(parts[1]);
48        let range_parts: Vec<&str> = parts[0].split('-').collect();
49
50        if range_parts.len() != 2 {
51            anyhow::bail!("Invalid line range format: {}", s);
52        }
53
54        let start_line = range_parts[0]
55            .parse()
56            .context("Invalid start line number")?;
57        let end_line = range_parts[1].parse().context("Invalid end line number")?;
58
59        if start_line > end_line {
60            anyhow::bail!(
61                "Start line ({}) must be <= end line ({})",
62                start_line,
63                end_line
64            );
65        }
66
67        Ok(FileRange {
68            file,
69            start_line,
70            end_line,
71        })
72    }
73}
74
/// A single ignore entry representing an acceptable duplicate
///
/// Entries are matched purely by `id`; the remaining fields are descriptive
/// metadata stored alongside it in the ignore file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct IgnoreEntry {
    /// Content-based ID of the duplicate. IDs produced by
    /// `compute_duplicate_id` / `compute_symmetric_duplicate_id` have the form
    /// `sha256:<hex digest>` and do not depend on file paths.
    pub id: String,

    /// Files and line ranges where this duplicate appears
    // NOTE(review): not consulted when matching in this module — informational only.
    pub files: Vec<FileRange>,

    /// Human-readable reason for ignoring this duplicate
    pub reason: String,

    /// User who added this ignore (email or username)
    pub added_by: String,

    /// Timestamp when this ignore was added (`IgnoreEntry::new` uses `Utc::now()`)
    pub added_at: DateTime<Utc>,
}
94
95impl IgnoreEntry {
96    /// Create a new ignore entry with the current timestamp
97    pub fn new(id: String, files: Vec<FileRange>, reason: String, added_by: String) -> Self {
98        Self {
99            id,
100            files,
101            reason,
102            added_by,
103            added_at: Utc::now(),
104        }
105    }
106
107    /// Check if this ignore entry matches the given duplicate ID
108    pub fn matches_id(&self, duplicate_id: &str) -> bool {
109        self.id == duplicate_id
110    }
111}
112
/// Container for the .polydup-ignore file format
///
/// This is the value that `IgnoreManager` serializes to and deserializes from
/// TOML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IgnoreFile {
    /// Format version of the file; compared against `IGNORE_FILE_VERSION` on load.
    pub version: u32,
    /// All ignore entries, in file order.
    pub ignores: Vec<IgnoreEntry>,
}
119
120impl Default for IgnoreFile {
121    fn default() -> Self {
122        Self {
123            version: IGNORE_FILE_VERSION,
124            ignores: Vec::new(),
125        }
126    }
127}
128
/// Manages loading, saving, and querying ignore entries
///
/// Holds the in-memory copy of the ignore file together with a set of its
/// entry IDs that backs `is_ignored`.
pub struct IgnoreManager {
    /// Location of the ignore file: `<directory>/.polydup-ignore`.
    ignore_file_path: PathBuf,
    /// In-memory contents of the ignore file; this is what `save` serializes.
    ignore_file: IgnoreFile,
    /// IDs of the entries in `ignore_file`, used for membership checks.
    ignored_ids: HashSet<String>,
}
135
136impl IgnoreManager {
137    /// Create a new IgnoreManager for the given directory
138    pub fn new(directory: &Path) -> Self {
139        let ignore_file_path = directory.join(".polydup-ignore");
140        Self {
141            ignore_file_path,
142            ignore_file: IgnoreFile::default(),
143            ignored_ids: HashSet::new(),
144        }
145    }
146
147    /// Load the .polydup-ignore file if it exists
148    pub fn load(&mut self) -> Result<()> {
149        if !self.ignore_file_path.exists() {
150            // No ignore file is not an error - just means nothing is ignored
151            return Ok(());
152        }
153
154        let contents = std::fs::read_to_string(&self.ignore_file_path)
155            .context("Failed to read .polydup-ignore file")?;
156
157        self.ignore_file =
158            toml::from_str(&contents).context("Failed to parse .polydup-ignore file")?;
159
160        // Validate version
161        if self.ignore_file.version > IGNORE_FILE_VERSION {
162            anyhow::bail!(
163                "Unsupported .polydup-ignore version: {} (expected <= {})",
164                self.ignore_file.version,
165                IGNORE_FILE_VERSION
166            );
167        }
168
169        // Build lookup set for fast ID checks
170        self.ignored_ids = self
171            .ignore_file
172            .ignores
173            .iter()
174            .map(|entry| entry.id.clone())
175            .collect();
176
177        Ok(())
178    }
179
180    /// Save the current ignore entries to .polydup-ignore
181    pub fn save(&self) -> Result<()> {
182        let contents =
183            toml::to_string_pretty(&self.ignore_file).context("Failed to serialize ignore file")?;
184
185        std::fs::write(&self.ignore_file_path, contents)
186            .context("Failed to write .polydup-ignore file")?;
187
188        Ok(())
189    }
190
191    /// Check if a duplicate with the given ID should be ignored
192    pub fn is_ignored(&self, duplicate_id: &str) -> bool {
193        self.ignored_ids.contains(duplicate_id)
194    }
195
196    /// Add a new ignore entry
197    pub fn add_ignore(&mut self, entry: IgnoreEntry) {
198        self.ignored_ids.insert(entry.id.clone());
199        self.ignore_file.ignores.push(entry);
200    }
201
202    /// Remove an ignore entry by ID
203    pub fn remove_ignore(&mut self, duplicate_id: &str) -> bool {
204        if let Some(pos) = self
205            .ignore_file
206            .ignores
207            .iter()
208            .position(|e| e.id == duplicate_id)
209        {
210            self.ignore_file.ignores.remove(pos);
211            self.ignored_ids.remove(duplicate_id);
212            true
213        } else {
214            false
215        }
216    }
217
218    /// Get all ignore entries
219    pub fn list_ignores(&self) -> &[IgnoreEntry] {
220        &self.ignore_file.ignores
221    }
222
223    /// Get the number of ignored duplicates
224    pub fn count(&self) -> usize {
225        self.ignore_file.ignores.len()
226    }
227}
228
229/// Compute a content-based ID for a duplicate
230///
231/// This uses SHA256 of the normalized token sequence, ensuring:
232/// - Ignores survive file renames
233/// - Ignores survive whitespace/comment changes
234/// - Two identical code blocks get the same ID
235pub fn compute_duplicate_id(normalized_tokens: &[String]) -> String {
236    let mut hasher = Sha256::new();
237
238    // Hash the concatenated normalized tokens
239    for token in normalized_tokens {
240        hasher.update(token.as_bytes());
241        hasher.update(b"\n"); // Separator to avoid collisions
242    }
243
244    let result = hasher.finalize();
245    format!("sha256:{}", hex::encode(result))
246}
247
248/// Compute a symmetric ID for a pair of token windows.
249///
250/// When both sides normalize to the same token sequence (Type-1/2 clones),
251/// this returns the legacy single-window ID to keep existing ignore files valid.
252pub fn compute_symmetric_duplicate_id(
253    normalized_tokens1: &[String],
254    normalized_tokens2: &[String],
255) -> String {
256    let id1 = compute_duplicate_id(normalized_tokens1);
257    let id2 = compute_duplicate_id(normalized_tokens2);
258
259    // Preserve legacy IDs when both windows hash the same (or collide)
260    if id1 == id2 {
261        return id1;
262    }
263
264    let (first, second) = if id1 <= id2 { (id1, id2) } else { (id2, id1) };
265
266    let mut hasher = Sha256::new();
267    hasher.update(first.as_bytes());
268    hasher.update(b"\n");
269    hasher.update(second.as_bytes());
270
271    let result = hasher.finalize();
272    format!("sha256:{}", hex::encode(result))
273}
274
#[cfg(test)]
mod tests {
    //! Unit tests for range parsing/formatting, ID computation, and the
    //! in-memory behavior of `IgnoreManager` (no test here touches the disk).

    use super::*;

    #[test]
    fn test_file_range_parse() {
        let range = FileRange::parse("src/main.rs:10-25").unwrap();
        assert_eq!(range.file, PathBuf::from("src/main.rs"));
        assert_eq!(range.start_line, 10);
        assert_eq!(range.end_line, 25);
    }

    #[test]
    fn test_file_range_parse_invalid() {
        assert!(FileRange::parse("invalid").is_err());
        assert!(FileRange::parse("src/main.rs").is_err());
        assert!(FileRange::parse("src/main.rs:10").is_err());
        assert!(FileRange::parse("src/main.rs:25-10").is_err()); // start > end
    }

    #[test]
    fn test_file_range_parse_malformed_line_numbers() {
        assert!(FileRange::parse("src/main.rs:a-b").is_err()); // not numeric
        assert!(FileRange::parse("src/main.rs:10-").is_err()); // missing end
        assert!(FileRange::parse("src/main.rs:-25").is_err()); // missing start
        assert!(FileRange::parse("src/main.rs:1-2-3").is_err()); // too many fields
    }

    #[test]
    fn test_file_range_parse_splits_on_last_colon() {
        // A colon inside the path (e.g. a drive prefix) must stay in the path.
        let range = FileRange::parse("C:\\src\\main.rs:10-25").unwrap();
        assert_eq!(range.file, PathBuf::from("C:\\src\\main.rs"));
        assert_eq!(range.start_line, 10);
        assert_eq!(range.end_line, 25);
    }

    #[test]
    fn test_file_range_parse_single_line() {
        let range = FileRange::parse("src/main.rs:7-7").unwrap();
        assert_eq!(range.start_line, 7);
        assert_eq!(range.end_line, 7);
    }

    #[test]
    fn test_file_range_display() {
        let range = FileRange {
            file: PathBuf::from("src/lib.rs"),
            start_line: 5,
            end_line: 15,
        };
        assert_eq!(range.to_string(), "src/lib.rs:5-15");
    }

    #[test]
    fn test_file_range_round_trip() {
        let text = "src/lib.rs:5-15";
        let range = FileRange::parse(text).unwrap();
        assert_eq!(range.to_string(), text);
    }

    #[test]
    fn test_compute_duplicate_id() {
        let tokens1 = vec!["fn".to_string(), "$$ID".to_string(), "$$NUM".to_string()];
        let tokens2 = vec!["fn".to_string(), "$$ID".to_string(), "$$NUM".to_string()];
        let tokens3 = vec!["fn".to_string(), "$$ID".to_string(), "$$STR".to_string()];

        let id1 = compute_duplicate_id(&tokens1);
        let id2 = compute_duplicate_id(&tokens2);
        let id3 = compute_duplicate_id(&tokens3);

        assert_eq!(id1, id2, "Same tokens should produce same ID");
        assert_ne!(id1, id3, "Different tokens should produce different IDs");
        assert!(id1.starts_with("sha256:"), "ID should have sha256 prefix");
    }

    #[test]
    fn test_compute_duplicate_id_token_boundaries() {
        // The per-token separator must keep differently split tokens apart.
        let joined = vec!["ab".to_string(), "c".to_string()];
        let split = vec!["a".to_string(), "bc".to_string()];

        assert_ne!(compute_duplicate_id(&joined), compute_duplicate_id(&split));
    }

    #[test]
    fn test_compute_duplicate_id_symmetric_same_tokens() {
        let tokens = vec!["a".to_string(), "b".to_string()];

        let symmetric = compute_symmetric_duplicate_id(&tokens, &tokens);
        let single = compute_duplicate_id(&tokens);

        assert_eq!(
            symmetric, single,
            "Symmetric ID should match legacy ID when windows are identical"
        );
    }

    #[test]
    fn test_compute_duplicate_id_symmetric_order_independent() {
        let tokens_a = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let tokens_b = vec![
            "a".to_string(),
            "b".to_string(),
            "c".to_string(),
            "d".to_string(),
        ];

        let id1 = compute_symmetric_duplicate_id(&tokens_a, &tokens_b);
        let id2 = compute_symmetric_duplicate_id(&tokens_b, &tokens_a);

        assert_eq!(id1, id2, "Symmetric ID should ignore argument order");
        assert_ne!(
            id1,
            compute_duplicate_id(&tokens_a),
            "Should incorporate both windows when they differ"
        );
        assert!(id1.starts_with("sha256:"), "ID should have sha256 prefix");
    }

    #[test]
    fn test_ignore_entry_creation() {
        let files = vec![FileRange {
            file: PathBuf::from("src/main.rs"),
            start_line: 1,
            end_line: 10,
        }];

        let entry = IgnoreEntry::new(
            "sha256:abc123".to_string(),
            files.clone(),
            "License header".to_string(),
            "user@example.com".to_string(),
        );

        assert_eq!(entry.id, "sha256:abc123");
        assert_eq!(entry.files, files);
        assert_eq!(entry.reason, "License header");
        assert_eq!(entry.added_by, "user@example.com");
        assert!(entry.matches_id("sha256:abc123"));
        assert!(!entry.matches_id("sha256:other"));
    }

    #[test]
    fn test_ignore_manager_basic() {
        // The directory is only used to build a path; nothing is read or written.
        let temp_dir = std::env::temp_dir();
        let mut manager = IgnoreManager::new(&temp_dir);

        // Initially no ignores
        assert_eq!(manager.count(), 0);
        assert!(!manager.is_ignored("sha256:test"));

        // Add an ignore
        let entry = IgnoreEntry::new(
            "sha256:test".to_string(),
            vec![],
            "Test".to_string(),
            "test@example.com".to_string(),
        );
        manager.add_ignore(entry);

        assert_eq!(manager.count(), 1);
        assert!(manager.is_ignored("sha256:test"));
        assert!(!manager.is_ignored("sha256:other"));

        // Remove the ignore
        assert!(manager.remove_ignore("sha256:test"));
        assert_eq!(manager.count(), 0);
        assert!(!manager.is_ignored("sha256:test"));
    }

    #[test]
    fn test_ignore_manager_list_ignores_preserves_order() {
        let temp_dir = std::env::temp_dir();
        let mut manager = IgnoreManager::new(&temp_dir);

        for id in ["sha256:first", "sha256:second"] {
            manager.add_ignore(IgnoreEntry::new(
                id.to_string(),
                vec![],
                "Test".to_string(),
                "test@example.com".to_string(),
            ));
        }

        let ids: Vec<&str> = manager
            .list_ignores()
            .iter()
            .map(|entry| entry.id.as_str())
            .collect();
        assert_eq!(ids, vec!["sha256:first", "sha256:second"]);
    }

    #[test]
    fn test_ignore_manager_remove_nonexistent() {
        let temp_dir = std::env::temp_dir();
        let mut manager = IgnoreManager::new(&temp_dir);

        assert!(!manager.remove_ignore("sha256:nonexistent"));
    }
}
410}