Skip to main content

vdsl_sync/infra/
hasher.rs

//! Content identity hashing.
//!
//! Two-layer hash model:
//! - **`file_hash`** — DJB2 of the entire file's bytes. Required for all files.
//!   Used for change detection and generic duplicate detection.
//! - **`content_hash`** — format-specific semantic hash (e.g. DJB2 of PNG IHDR+IDAT).
//!   Used for high-precision duplicate detection that ignores metadata differences.
//!
//! [`ContentHasher`] is pluggable — the default implementation ([`Djb2Hasher`]) computes
//! both layers. The PNG `content_hash` is Lua-compatible (`png.image_hash`).
11
12use std::io::{BufReader, Read, Seek, SeekFrom};
13use std::path::Path;
14
15use crate::infra::error::InfraError;
16
/// Outcome of hashing one file: the two layers of content identity.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HashResult {
    /// DJB2 hash computed over the whole file. Always populated.
    pub file_hash: String,
    /// Semantic, format-aware hash (for example PNG pixel identity).
    /// `None` unless the file's format is one of the supported ones.
    pub content_hash: Option<String>,
}
26
27/// Pluggable content identity resolver.
28///
29/// Computes both generic file hash and format-specific content hash.
30pub trait ContentHasher: Send + Sync {
31    /// Compute hashes for the given file.
32    ///
33    /// `file_hash` is always computed. `content_hash` is computed
34    /// only for supported formats (e.g. PNG).
35    fn hash_file(&self, path: &Path) -> Result<HashResult, InfraError>;
36}
37
/// The default [`ContentHasher`]: DJB2 over every file, plus a semantic
/// IHDR+IDAT hash for PNG images.
///
/// - `file_hash` — DJB2 of all file bytes, rendered as `%016x`.
/// - `content_hash` — PNG only: DJB2 of the IHDR and IDAT chunks, rendered as
///   `%016x`. Yields the same value as Lua's `png.image_hash()`.
pub struct Djb2Hasher;
44
45impl ContentHasher for Djb2Hasher {
46    fn hash_file(&self, path: &Path) -> Result<HashResult, InfraError> {
47        let file_hash = djb2_file_hash(path)?;
48
49        // content_hash is format-specific. Currently only PNG is supported.
50        // When adding new formats, update both here and FileType::from_extension.
51        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
52        let content_hash = if ext.eq_ignore_ascii_case("png") {
53            match png_image_hash(path) {
54                Ok(h) => Some(h),
55                Err(e) => {
56                    tracing::warn!(
57                        path = %path.display(),
58                        error = %e,
59                        "png content_hash failed — falling back to file_hash only"
60                    );
61                    None
62                }
63            }
64        } else {
65            None
66        };
67
68        Ok(HashResult {
69            file_hash,
70            content_hash,
71        })
72    }
73}
74
75/// Compute DJB2 hash of entire file content.
76///
77/// Reads file in 8KB chunks for memory efficiency.
78/// Returns 16-char hex string (`%016x`).
79pub fn djb2_file_hash(path: &Path) -> Result<String, InfraError> {
80    let file = std::fs::File::open(path).map_err(|e| InfraError::Hash {
81        op: "djb2",
82        reason: format!("open failed: {e}"),
83    })?;
84    let mut reader = BufReader::new(file);
85
86    let mut h: u64 = 5381;
87    let mut buf = [0u8; 8192];
88    loop {
89        let n = reader.read(&mut buf).map_err(|e| InfraError::Hash {
90            op: "djb2",
91            reason: format!("read failed: {e}"),
92        })?;
93        if n == 0 {
94            break;
95        }
96        // Use full u64 width for file hash (unlike PNG hash which uses 32-bit for Lua compat).
97        // This reduces collision probability from ~1/2^32 to ~1/2^64.
98        for &b in &buf[..n] {
99            h = h.wrapping_mul(33).wrapping_add(b as u64);
100        }
101    }
102    Ok(format!("{h:016x}"))
103}
104
105/// Compute DJB2 hash of IHDR + IDAT chunk data in a PNG file.
106///
107/// Algorithm matches Lua's `png.image_hash(filepath)`:
108/// 1. Verify PNG signature
109/// 2. Walk chunks, for IHDR and IDAT: feed chunk_type + chunk_data into DJB2
110/// 3. Return 16-char hex string (`%016x`)
111pub fn png_image_hash(path: &Path) -> Result<String, InfraError> {
112    let file = std::fs::File::open(path).map_err(|e| InfraError::Hash {
113        op: "png",
114        reason: format!("open failed: {e}"),
115    })?;
116    let mut reader = BufReader::new(file);
117
118    // Verify PNG signature
119    let mut sig = [0u8; 8];
120    reader.read_exact(&mut sig).map_err(|e| InfraError::Hash {
121        op: "png",
122        reason: format!("read sig failed: {e}"),
123    })?;
124    if sig != [137, 80, 78, 71, 13, 10, 26, 10] {
125        return Err(InfraError::Hash {
126            op: "png",
127            reason: "not a valid PNG file".into(),
128        });
129    }
130
131    let mut h: u64 = 5381;
132    let mut reached_iend = false;
133
134    loop {
135        let mut header = [0u8; 8];
136        match reader.read_exact(&mut header) {
137            Ok(()) => {}
138            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
139            Err(e) => {
140                return Err(InfraError::Hash {
141                    op: "png",
142                    reason: format!("read chunk header failed: {e}"),
143                })
144            }
145        }
146        // u32 → u64: always lossless (max 4_294_967_295). Further bounded by MAX_CHUNK_LEN below.
147        let length = u32::from_be_bytes([header[0], header[1], header[2], header[3]]) as u64;
148        // Guard against malicious chunk lengths (PNG spec max is 2^31-1)
149        const MAX_CHUNK_LEN: u64 = 0x7FFF_FFFF;
150        if length > MAX_CHUNK_LEN {
151            return Err(InfraError::Hash {
152                op: "png",
153                reason: format!("chunk length exceeds PNG spec maximum: {length}"),
154            });
155        }
156        let chunk_type = &header[4..8];
157
158        if chunk_type == b"IEND" {
159            reached_iend = true;
160            break;
161        }
162
163        if chunk_type == b"IHDR" || chunk_type == b"IDAT" {
164            // Hash chunk_type bytes
165            // Lua-compatible: compute DJB2 in 32-bit width (% 0x100000000)
166            // to produce the same value as Lua's png.image_hash().
167            for &b in chunk_type {
168                h = h.wrapping_mul(33).wrapping_add(b as u64) % 0x100000000;
169            }
170            // Hash chunk data
171            let mut remaining = length;
172            let mut buf = [0u8; 8192];
173            while remaining > 0 {
174                let to_read = std::cmp::min(remaining, buf.len() as u64) as usize;
175                reader
176                    .read_exact(&mut buf[..to_read])
177                    .map_err(|e| InfraError::Hash {
178                        op: "png",
179                        reason: format!("read data failed: {e}"),
180                    })?;
181                for &b in &buf[..to_read] {
182                    h = h.wrapping_mul(33).wrapping_add(b as u64) % 0x100000000;
183                }
184                remaining -= to_read as u64;
185            }
186            // Skip CRC (4 bytes)
187            reader
188                .seek(SeekFrom::Current(4))
189                .map_err(|e| InfraError::Hash {
190                    op: "png",
191                    reason: format!("seek crc failed: {e}"),
192                })?;
193        } else {
194            // Skip chunk data + CRC
195            let skip = i64::try_from(length).map_err(|_| InfraError::Hash {
196                op: "png",
197                reason: format!("chunk length overflow: {length}"),
198            })? + 4;
199            reader
200                .seek(SeekFrom::Current(skip))
201                .map_err(|e| InfraError::Hash {
202                    op: "png",
203                    reason: format!("seek skip failed: {e}"),
204                })?;
205        }
206    }
207
208    if !reached_iend {
209        return Err(InfraError::Hash {
210            op: "png",
211            reason: "truncated PNG: IEND chunk not found".into(),
212        });
213    }
214
215    Ok(format!("{h:016x}"))
216}
217
#[cfg(test)]
mod tests {
    use super::*;

    /// Hex rendering of the untouched DJB2 seed (5381 = 0x1505).
    const SEED_HEX: &str = "0000000000001505";

    /// Append one PNG chunk (length, type, data, zeroed CRC placeholder) to `out`.
    fn push_chunk(out: &mut Vec<u8>, kind: &[u8; 4], data: &[u8]) {
        out.extend_from_slice(&(data.len() as u32).to_be_bytes());
        out.extend_from_slice(kind);
        out.extend_from_slice(data);
        out.extend_from_slice(&[0, 0, 0, 0]); // CRC placeholder
    }

    /// Build a minimal valid PNG with given IDAT data and optional tEXt chunks.
    fn build_test_png(idat_data: &[u8], text_chunks: &[(&str, &str)]) -> Vec<u8> {
        // PNG signature
        let mut png: Vec<u8> = vec![137, 80, 78, 71, 13, 10, 26, 10];

        // IHDR (1x1 RGB)
        push_chunk(&mut png, b"IHDR", &[0, 0, 0, 1, 0, 0, 0, 1, 8, 2, 0, 0, 0]);

        // tEXt chunks: keyword, NUL separator, text
        for (keyword, text) in text_chunks {
            let mut payload = keyword.as_bytes().to_vec();
            payload.push(0);
            payload.extend_from_slice(text.as_bytes());
            push_chunk(&mut png, b"tEXt", &payload);
        }

        push_chunk(&mut png, b"IDAT", idat_data);
        push_chunk(&mut png, b"IEND", &[]);
        png
    }

    /// Write `bytes` to `name` inside `dir` and hand back the full path.
    fn write_fixture(dir: &Path, name: &str, bytes: &[u8]) -> std::path::PathBuf {
        let path = dir.join(name);
        std::fs::write(&path, bytes).unwrap();
        path
    }

    /// True when `s` is exactly 16 ASCII hex digits.
    fn is_hex16(s: &str) -> bool {
        s.len() == 16 && s.chars().all(|c| c.is_ascii_hexdigit())
    }

    // =========================================================================
    // djb2_file_hash — generic file hash
    // =========================================================================

    #[test]
    fn file_hash_non_empty() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(dir.path(), "data.json", b"{}");
        let hash = djb2_file_hash(&path).unwrap();
        assert!(is_hex16(&hash));
    }

    #[test]
    fn file_hash_deterministic() {
        let dir = tempfile::tempdir().unwrap();
        let first = write_fixture(dir.path(), "a.txt", b"hello world");
        let second = write_fixture(dir.path(), "b.txt", b"hello world");
        assert_eq!(
            djb2_file_hash(&first).unwrap(),
            djb2_file_hash(&second).unwrap()
        );
    }

    #[test]
    fn file_hash_different_content() {
        let dir = tempfile::tempdir().unwrap();
        let first = write_fixture(dir.path(), "a.txt", b"content_a");
        let second = write_fixture(dir.path(), "b.txt", b"content_b");
        assert_ne!(
            djb2_file_hash(&first).unwrap(),
            djb2_file_hash(&second).unwrap()
        );
    }

    #[test]
    fn file_hash_empty_file() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(dir.path(), "empty", b"");
        // Nothing is folded in, so the result is the DJB2 initial value.
        assert_eq!(djb2_file_hash(&path).unwrap(), SEED_HEX);
    }

    // =========================================================================
    // png_image_hash — PNG semantic hash
    // =========================================================================

    #[test]
    fn png_hash_valid() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(dir.path(), "test.png", &build_test_png(b"PIXEL_DATA", &[]));
        let hash = png_image_hash(&path).unwrap();
        assert!(is_hex16(&hash));
    }

    #[test]
    fn png_hash_not_png() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(dir.path(), "not.png", b"not a png");
        assert!(png_image_hash(&path).is_err());
    }

    #[test]
    fn png_same_pixels_different_metadata() {
        let dir = tempfile::tempdir().unwrap();
        let bare = write_fixture(dir.path(), "a.png", &build_test_png(b"SAME_PIXELS", &[]));
        let tagged = write_fixture(
            dir.path(),
            "b.png",
            &build_test_png(b"SAME_PIXELS", &[("vdsl", r#"{"seed":42}"#)]),
        );

        let h1 = png_image_hash(&bare).unwrap();
        let h2 = png_image_hash(&tagged).unwrap();
        assert_eq!(h1, h2, "same pixels must yield same content_hash");
    }

    #[test]
    fn png_different_pixels() {
        let dir = tempfile::tempdir().unwrap();
        let first = write_fixture(dir.path(), "a.png", &build_test_png(b"PIXELS_AAA", &[]));
        let second = write_fixture(dir.path(), "b.png", &build_test_png(b"PIXELS_BBB", &[]));
        assert_ne!(
            png_image_hash(&first).unwrap(),
            png_image_hash(&second).unwrap()
        );
    }

    #[test]
    fn png_deterministic() {
        let dir = tempfile::tempdir().unwrap();
        let bytes = build_test_png(b"DETERMINISTIC", &[]);
        let first = write_fixture(dir.path(), "d1.png", &bytes);
        let second = write_fixture(dir.path(), "d2.png", &bytes);

        let h1 = png_image_hash(&first).unwrap();
        let h2 = png_image_hash(&second).unwrap();
        assert_eq!(h1, h2);
        assert_ne!(h1, SEED_HEX);
    }

    // =========================================================================
    // Djb2Hasher (ContentHasher trait)
    // =========================================================================

    #[test]
    fn hasher_non_png_no_content_hash() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(dir.path(), "data.json", b"{}");
        let result = Djb2Hasher.hash_file(&path).unwrap();
        assert_eq!(result.file_hash.len(), 16);
        assert!(result.content_hash.is_none());
    }

    #[test]
    fn hasher_png_has_both_hashes() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(dir.path(), "test.png", &build_test_png(b"DATA", &[]));
        let result = Djb2Hasher.hash_file(&path).unwrap();
        assert_eq!(result.file_hash.len(), 16);
        assert!(result.content_hash.is_some());
        assert_eq!(result.content_hash.as_ref().unwrap().len(), 16);
    }

    #[test]
    fn hasher_png_file_hash_differs_from_content_hash() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(dir.path(), "test.png", &build_test_png(b"PIXEL_DATA", &[]));
        let result = Djb2Hasher.hash_file(&path).unwrap();
        // file_hash covers the signature, CRCs, etc.; content_hash only IHDR+IDAT.
        assert_ne!(
            result.file_hash,
            result.content_hash.unwrap(),
            "file_hash (whole file) and content_hash (IHDR+IDAT) should differ"
        );
    }

    #[test]
    fn hasher_png_same_pixels_different_metadata_same_content_different_file() {
        let dir = tempfile::tempdir().unwrap();
        let bare = write_fixture(dir.path(), "a.png", &build_test_png(b"SAME", &[]));
        let tagged = write_fixture(
            dir.path(),
            "b.png",
            &build_test_png(b"SAME", &[("key", "metadata")]),
        );
        let r1 = Djb2Hasher.hash_file(&bare).unwrap();
        let r2 = Djb2Hasher.hash_file(&tagged).unwrap();

        // Same pixels, so the semantic hash agrees...
        assert_eq!(r1.content_hash, r2.content_hash);
        // ...while the extra metadata chunk changes the whole-file hash.
        assert_ne!(r1.file_hash, r2.file_hash);
    }

    /// Cross-language hash verification.
    /// Requires Lua test to have written /tmp/vdsl_hash_test.png and .lua_hash.
    /// Run explicitly: `cargo test cross_language_hash_match -- --ignored`
    #[test]
    #[ignore]
    fn cross_language_hash_match() {
        let png_path = Path::new("/tmp/vdsl_hash_test.png");
        let hash_path = Path::new("/tmp/vdsl_hash_test.lua_hash");
        assert!(
            png_path.exists() && hash_path.exists(),
            "required fixture files not found: /tmp/vdsl_hash_test.png and .lua_hash"
        );

        let rust_hash = png_image_hash(png_path).unwrap();
        let lua_raw = std::fs::read_to_string(hash_path).unwrap();
        let lua_hash = lua_raw.trim().to_string();
        assert_eq!(
            rust_hash, lua_hash,
            "Rust hash ({rust_hash}) must match Lua hash ({lua_hash})"
        );
    }
}