Skip to main content

lang_check/
hashing.rs

1use std::collections::HashSet;
2use std::collections::hash_map::DefaultHasher;
3use std::hash::{Hash, Hasher};
4use std::path::{Path, PathBuf};
5
6use anyhow::Result;
7use serde::{Deserialize, Serialize};
8
/// Snap `byte` backwards onto a UTF-8 character boundary of `s`.
///
/// Offsets past the end of `s` are clamped to `s.len()` first, so the result
/// is always a valid slicing index for `s`.
fn floor_char_boundary(s: &str, byte: usize) -> usize {
    let clamped = byte.min(s.len());
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=clamped)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0)
}
17
/// Snap `byte` forwards onto a UTF-8 character boundary of `s`.
///
/// Offsets past the end of `s` are clamped to `s.len()` first, so the result
/// is always a valid slicing index for `s`.
fn ceil_char_boundary(s: &str, byte: usize) -> usize {
    let len = s.len();
    // `len` itself is always a boundary, which is the fallback when no
    // earlier boundary exists at or after the clamped offset.
    (byte.min(len)..len)
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(len)
}
26
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct DiagnosticFingerprint {
29    pub message_hash: u64,
30    pub context_hash: u64,
31    pub anchor_hash: u64,
32}
33
34impl DiagnosticFingerprint {
35    #[must_use]
36    pub fn new(message: &str, text: &str, start_byte: usize, end_byte: usize) -> Self {
37        let mut message_hasher = DefaultHasher::new();
38        message.hash(&mut message_hasher);
39
40        // Extract context: up to 20 chars before and after, snapped to char boundaries
41        let start = floor_char_boundary(text, start_byte.saturating_sub(20));
42        let end = ceil_char_boundary(text, (end_byte + 20).min(text.len()));
43        let context = &text[start..end];
44
45        let mut context_hasher = DefaultHasher::new();
46        context.hash(&mut context_hasher);
47
48        // Fuzzy anchor: 3 words before and after the error span
49        let mut anchor_hasher = DefaultHasher::new();
50        Self::extract_word_anchor(text, start_byte, end_byte).hash(&mut anchor_hasher);
51
52        Self {
53            message_hash: message_hasher.finish(),
54            context_hash: context_hasher.finish(),
55            anchor_hash: anchor_hasher.finish(),
56        }
57    }
58
59    fn extract_word_anchor(text: &str, start_byte: usize, end_byte: usize) -> String {
60        let sb = floor_char_boundary(text, start_byte.min(text.len()));
61        let before: String = text[..sb]
62            .split_whitespace()
63            .rev()
64            .take(3)
65            .collect::<Vec<_>>()
66            .into_iter()
67            .rev()
68            .collect::<Vec<_>>()
69            .join(" ");
70        let eb = ceil_char_boundary(text, end_byte.min(text.len()));
71        let after: String = text[eb..]
72            .split_whitespace()
73            .take(3)
74            .collect::<Vec<_>>()
75            .join(" ");
76        format!("{before}|{after}")
77    }
78
79    fn combined_hash(&self) -> u64 {
80        let mut hasher = DefaultHasher::new();
81        self.message_hash.hash(&mut hasher);
82        self.context_hash.hash(&mut hasher);
83        self.anchor_hash.hash(&mut hasher);
84        hasher.finish()
85    }
86}
87
88#[derive(Serialize, Deserialize)]
89struct IgnoreStoreData {
90    fingerprints: Vec<u64>,
91}
92
/// Set of diagnostics the user has chosen to ignore, optionally backed by a
/// JSON file.
#[derive(Debug)]
pub struct IgnoreStore {
    /// Combined fingerprint hashes of the ignored diagnostics.
    ignored_fingerprints: HashSet<u64>,
    /// File the set is written to after each change; `None` keeps the store
    /// in memory only.
    persist_path: Option<PathBuf>,
}
97
98impl Default for IgnoreStore {
99    fn default() -> Self {
100        Self::new()
101    }
102}
103
104impl IgnoreStore {
105    #[must_use]
106    pub fn new() -> Self {
107        Self {
108            ignored_fingerprints: HashSet::new(),
109            persist_path: None,
110        }
111    }
112
113    /// Load an `IgnoreStore` from a workspace root, reading `.languagecheck/ignores.json`.
114    pub fn load(workspace_root: &Path) -> Result<Self> {
115        let persist_path = workspace_root.join(".languagecheck").join("ignores.json");
116        let mut store = Self {
117            ignored_fingerprints: HashSet::new(),
118            persist_path: Some(persist_path.clone()),
119        };
120
121        if persist_path.exists() {
122            let data = std::fs::read_to_string(&persist_path)?;
123            let stored: IgnoreStoreData = serde_json::from_str(&data)?;
124            store.ignored_fingerprints = stored.fingerprints.into_iter().collect();
125        }
126
127        Ok(store)
128    }
129
130    pub fn ignore(&mut self, fingerprint: &DiagnosticFingerprint) {
131        self.ignored_fingerprints
132            .insert(fingerprint.combined_hash());
133        if let Err(e) = self.persist() {
134            eprintln!("Warning: failed to persist ignore store: {e}");
135        }
136    }
137
138    #[must_use]
139    pub fn is_ignored(&self, fingerprint: &DiagnosticFingerprint) -> bool {
140        self.ignored_fingerprints
141            .contains(&fingerprint.combined_hash())
142    }
143
144    fn persist(&self) -> Result<()> {
145        let Some(path) = &self.persist_path else {
146            return Ok(());
147        };
148
149        if let Some(parent) = path.parent() {
150            std::fs::create_dir_all(parent)?;
151        }
152
153        let data = IgnoreStoreData {
154            fingerprints: self.ignored_fingerprints.iter().copied().collect(),
155        };
156        std::fs::write(path, serde_json::to_string_pretty(&data)?)?;
157        Ok(())
158    }
159}
160
/// Unit tests for fingerprint hashing and the ignore store.
#[cfg(test)]
mod tests {
    use super::*;

    /// Per-process scratch directory, so concurrent test runs on the same
    /// machine do not delete each other's files.
    fn scratch_dir(label: &str) -> PathBuf {
        std::env::temp_dir().join(format!("lang_check_test_{label}_{}", std::process::id()))
    }

    #[test]
    fn fingerprint_same_input_same_hash() {
        let fp1 = DiagnosticFingerprint::new("bad grammar", "This has bad grammar here.", 9, 12);
        let fp2 = DiagnosticFingerprint::new("bad grammar", "This has bad grammar here.", 9, 12);
        assert_eq!(fp1.combined_hash(), fp2.combined_hash());
    }

    #[test]
    fn fingerprint_different_message_different_hash() {
        let fp1 = DiagnosticFingerprint::new("bad grammar", "This has bad grammar here.", 9, 12);
        let fp2 = DiagnosticFingerprint::new("spelling error", "This has bad grammar here.", 9, 12);
        assert_ne!(fp1.combined_hash(), fp2.combined_hash());
    }

    #[test]
    fn fingerprint_different_context_different_hash() {
        let fp1 = DiagnosticFingerprint::new("error", "AAA error BBB", 4, 9);
        let fp2 = DiagnosticFingerprint::new("error", "CCC error DDD", 4, 9);
        assert_ne!(fp1.combined_hash(), fp2.combined_hash());
    }

    #[test]
    fn fingerprint_word_anchor_extraction() {
        let text = "one two three ERROR four five six";
        let anchor = DiagnosticFingerprint::extract_word_anchor(text, 14, 19);
        assert_eq!(anchor, "one two three|four five six");
    }

    #[test]
    fn fingerprint_word_anchor_at_start() {
        let text = "ERROR some words after";
        let anchor = DiagnosticFingerprint::extract_word_anchor(text, 0, 5);
        assert_eq!(anchor, "|some words after");
    }

    #[test]
    fn fingerprint_word_anchor_at_end() {
        let text = "words before ERROR";
        let anchor = DiagnosticFingerprint::extract_word_anchor(text, 13, 18);
        assert_eq!(anchor, "words before|");
    }

    #[test]
    fn ignore_store_basic_operations() {
        let mut store = IgnoreStore::new();
        let fp = DiagnosticFingerprint::new("test msg", "some test msg context", 5, 13);

        assert!(!store.is_ignored(&fp));
        store.ignore(&fp);
        assert!(store.is_ignored(&fp));
    }

    #[test]
    fn ignore_store_does_not_ignore_different_fingerprint() {
        let mut store = IgnoreStore::new();
        let fp1 = DiagnosticFingerprint::new("msg A", "context A msg A here", 10, 15);
        let fp2 = DiagnosticFingerprint::new("msg B", "context B msg B here", 10, 15);

        store.ignore(&fp1);
        assert!(store.is_ignored(&fp1));
        assert!(!store.is_ignored(&fp2));
    }

    #[test]
    fn ignore_store_persistence_roundtrip() {
        let dir = scratch_dir("ignore_persist");
        let _ = std::fs::remove_dir_all(&dir);
        std::fs::create_dir_all(&dir).unwrap();

        let fp = DiagnosticFingerprint::new("persist test", "the persist test text", 4, 16);

        // Write
        {
            let mut store = IgnoreStore::load(&dir).unwrap();
            store.ignore(&fp);
        }

        // Read back
        {
            let store = IgnoreStore::load(&dir).unwrap();
            assert!(store.is_ignored(&fp));
        }

        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn fingerprint_handles_multibyte_utf8() {
        // Byte offsets that land inside multi-byte chars must not panic.
        // Layout: 'Ä' = bytes 0..2, 'Ö' = bytes 11..13, 'ö' = bytes 15..17.
        let text = "Ärger mit Ölförderung";
        assert!(!text.is_char_boundary(12));
        assert!(!text.is_char_boundary(16));

        // The span start snaps back to 11 and the span end snaps forward to 17.
        let anchor = DiagnosticFingerprint::extract_word_anchor(text, 12, 16);
        assert_eq!(anchor, "Ärger mit|rderung");

        let mid_char = DiagnosticFingerprint::new("test", text, 12, 16);
        let snapped = DiagnosticFingerprint::new("test", text, 11, 17);
        assert_eq!(mid_char.combined_hash(), snapped.combined_hash());

        // Long enough that the context window edges (21 - 20 = 1 and
        // 23 + 20 = 43) also fall mid-character: every odd offset does.
        let long = "ä".repeat(30);
        let mid_char = DiagnosticFingerprint::new("test", &long, 21, 23);
        let snapped = DiagnosticFingerprint::new("test", &long, 20, 24);
        assert_eq!(mid_char.combined_hash(), snapped.combined_hash());
    }

    #[test]
    fn ignore_store_empty_persistence() {
        let dir = scratch_dir("ignore_empty");
        let _ = std::fs::remove_dir_all(&dir);
        std::fs::create_dir_all(&dir).unwrap();

        let store = IgnoreStore::load(&dir).unwrap();
        let fp = DiagnosticFingerprint::new("not ignored", "some context", 0, 5);
        assert!(!store.is_ignored(&fp));

        let _ = std::fs::remove_dir_all(&dir);
    }
}