// rust_par2/verify.rs
//! File verification against PAR2 checksums.
//!
//! Verifies files on disk by computing MD5 hashes and comparing them against
//! the hashes stored in the PAR2 file set. Optionally performs per-slice
//! CRC32/MD5 checks to identify exactly which blocks are damaged.
7use std::io::Read;
8use std::path::Path;
9
10use md5::{Digest, Md5};
11use rayon::prelude::*;
12use tracing::{debug, info, trace, warn};
13
14use crate::types::{DamagedFile, MissingFile, Par2FileSet, VerifiedFile, VerifyResult};
15
16/// Verify all files in a PAR2 set against actual files in a directory.
17///
18/// For each file described in the PAR2 set:
19/// - If the file exists and its MD5 matches → `intact`
20/// - If the file exists but MD5 doesn't match → `damaged` (with per-block detail)
21/// - If the file doesn't exist → `missing`
22pub fn verify(file_set: &Par2FileSet, dir: &Path) -> VerifyResult {
23    // Sort files by name for deterministic output
24    let mut files: Vec<_> = file_set.files.values().collect();
25    files.sort_by_key(|f| &f.filename);
26
27    // Verify files in parallel using rayon — each file is independent I/O + MD5
28    enum FileResult {
29        Intact(VerifiedFile),
30        Damaged(DamagedFile),
31        Missing(MissingFile),
32    }
33
34    let results: Vec<FileResult> = files
35        .par_iter()
36        .map(|par2_file| {
37            let file_path = dir.join(&par2_file.filename);
38
39            if !file_path.exists() {
40                debug!(filename = par2_file.filename, "file missing");
41                let block_count = blocks_for_file(par2_file.size, file_set.slice_size);
42                return FileResult::Missing(MissingFile {
43                    filename: par2_file.filename.clone(),
44                    expected_size: par2_file.size,
45                    block_count,
46                });
47            }
48
49            // Check file size first (fast reject)
50            let metadata = match std::fs::metadata(&file_path) {
51                Ok(m) => m,
52                Err(e) => {
53                    warn!(filename = par2_file.filename, error = %e, "cannot stat file");
54                    let block_count = blocks_for_file(par2_file.size, file_set.slice_size);
55                    return FileResult::Missing(MissingFile {
56                        filename: par2_file.filename.clone(),
57                        expected_size: par2_file.size,
58                        block_count,
59                    });
60                }
61            };
62
63            if metadata.len() != par2_file.size {
64                debug!(
65                    filename = par2_file.filename,
66                    expected = par2_file.size,
67                    actual = metadata.len(),
68                    "file size mismatch"
69                );
70                let total_blocks = blocks_for_file(par2_file.size, file_set.slice_size);
71                return FileResult::Damaged(DamagedFile {
72                    filename: par2_file.filename.clone(),
73                    size: metadata.len(),
74                    damaged_block_count: total_blocks,
75                    total_block_count: total_blocks,
76                    damaged_block_indices: (0..total_blocks).collect(),
77                });
78            }
79
80            // Compute full-file MD5
81            match compute_file_md5(&file_path) {
82                Ok(hash) => {
83                    if hash == par2_file.hash {
84                        trace!(filename = par2_file.filename, "file OK (MD5 match)");
85                        FileResult::Intact(VerifiedFile {
86                            filename: par2_file.filename.clone(),
87                            size: par2_file.size,
88                        })
89                    } else {
90                        let total_blocks = blocks_for_file(par2_file.size, file_set.slice_size);
91                        let bad_indices =
92                            find_damaged_blocks(&file_path, &par2_file.slices, file_set.slice_size);
93                        let damaged_blocks = bad_indices.len() as u32;
94
95                        debug!(
96                            filename = par2_file.filename,
97                            damaged_blocks,
98                            total_blocks,
99                            bad_indices = ?bad_indices,
100                            "file damaged (MD5 mismatch)"
101                        );
102
103                        FileResult::Damaged(DamagedFile {
104                            filename: par2_file.filename.clone(),
105                            size: par2_file.size,
106                            damaged_block_count: damaged_blocks,
107                            total_block_count: total_blocks,
108                            damaged_block_indices: bad_indices,
109                        })
110                    }
111                }
112                Err(e) => {
113                    warn!(filename = par2_file.filename, error = %e, "cannot hash file");
114                    let total_blocks = blocks_for_file(par2_file.size, file_set.slice_size);
115                    FileResult::Damaged(DamagedFile {
116                        filename: par2_file.filename.clone(),
117                        size: par2_file.size,
118                        damaged_block_count: total_blocks,
119                        total_block_count: total_blocks,
120                        damaged_block_indices: (0..total_blocks).collect(),
121                    })
122                }
123            }
124        })
125        .collect();
126
127    // Gather parallel results into separate vectors
128    let mut intact = Vec::new();
129    let mut damaged = Vec::new();
130    let mut missing = Vec::new();
131    for r in results {
132        match r {
133            FileResult::Intact(f) => intact.push(f),
134            FileResult::Damaged(f) => damaged.push(f),
135            FileResult::Missing(f) => missing.push(f),
136        }
137    }
138
139    // Count recovery blocks from ALL .par2 files in the directory (not just the index).
140    // The index file typically has 0 recovery blocks; they're in .vol*.par2 volumes.
141    let recovery_blocks_available = count_recovery_blocks_in_dir(dir, file_set);
142    let total_needed: u32 = damaged.iter().map(|d| d.damaged_block_count).sum::<u32>()
143        + missing.iter().map(|m| m.block_count).sum::<u32>();
144    let repair_possible = total_needed <= recovery_blocks_available;
145
146    info!(
147        intact = intact.len(),
148        damaged = damaged.len(),
149        missing = missing.len(),
150        blocks_needed = total_needed,
151        recovery_blocks_available,
152        "verification complete"
153    );
154
155    VerifyResult {
156        intact,
157        damaged,
158        missing,
159        recovery_blocks_available,
160        repair_possible,
161    }
162}
163
164/// Read buffer size for hashing. 2 MiB gives good kernel readahead and
165/// amortizes syscall overhead on large files.
166const HASH_BUF_SIZE: usize = 2 * 1024 * 1024;
167
168/// Compute the MD5 hash of a file using double-buffered I/O.
169/// One buffer is being hashed while the other is being filled by the OS,
170/// overlapping CPU and I/O work.
171fn compute_file_md5(path: &Path) -> std::io::Result<[u8; 16]> {
172    let mut file = std::fs::File::open(path)?;
173    let mut hasher = Md5::new();
174
175    let mut buf_a = vec![0u8; HASH_BUF_SIZE];
176    let mut buf_b = vec![0u8; HASH_BUF_SIZE];
177
178    // Fill first buffer
179    let mut n_a = file.read(&mut buf_a)?;
180
181    loop {
182        if n_a == 0 {
183            break;
184        }
185
186        // Start reading into buf_b while we hash buf_a.
187        // On Linux, the kernel's readahead will prefetch data for the next
188        // read while we're busy with MD5 computation.
189        let n_b = file.read(&mut buf_b)?;
190        hasher.update(&buf_a[..n_a]);
191
192        if n_b == 0 {
193            break;
194        }
195
196        // Now hash buf_b while reading into buf_a
197        n_a = file.read(&mut buf_a)?;
198        hasher.update(&buf_b[..n_b]);
199    }
200
201    Ok(hasher.finalize().into())
202}
203
204/// Compute the MD5 hash of the first 16 KiB of a file.
205///
206/// Useful for file identification when filenames are obfuscated.
207pub fn compute_hash_16k(path: &Path) -> std::io::Result<[u8; 16]> {
208    let mut file = std::fs::File::open(path)?;
209    let mut hasher = Md5::new();
210    let mut buf = [0u8; 16384]; // 16 KiB
211
212    let n = file.read(&mut buf)?;
213    hasher.update(&buf[..n]);
214
215    Ok(hasher.finalize().into())
216}
217
218/// Find the indices of damaged blocks via per-slice MD5 verification.
219/// Returns the 0-based indices of blocks that DON'T match their expected MD5.
220fn find_damaged_blocks(
221    path: &Path,
222    slices: &[crate::types::SliceChecksum],
223    slice_size: u64,
224) -> Vec<u32> {
225    if slices.is_empty() {
226        return vec![];
227    }
228
229    let mut file = match std::fs::File::open(path) {
230        Ok(f) => f,
231        Err(_) => return (0..slices.len() as u32).collect(),
232    };
233
234    let mut bad = Vec::new();
235    let mut buf = vec![0u8; slice_size as usize];
236
237    for (idx, expected) in slices.iter().enumerate() {
238        let n = match file.read(&mut buf) {
239            Ok(0) => {
240                // Remaining blocks are all missing
241                for i in idx..slices.len() {
242                    bad.push(i as u32);
243                }
244                break;
245            }
246            Ok(n) => n,
247            Err(_) => {
248                for i in idx..slices.len() {
249                    bad.push(i as u32);
250                }
251                break;
252            }
253        };
254
255        let mut hasher = Md5::new();
256        hasher.update(&buf[..n]);
257        if n < slice_size as usize {
258            let padding = vec![0u8; slice_size as usize - n];
259            hasher.update(&padding);
260        }
261        let hash: [u8; 16] = hasher.finalize().into();
262
263        if hash != expected.md5 {
264            bad.push(idx as u32);
265        }
266    }
267
268    bad
269}
270
271/// Count recovery blocks across all .par2 files in a directory.
272fn count_recovery_blocks_in_dir(dir: &Path, file_set: &Par2FileSet) -> u32 {
273    let entries = match std::fs::read_dir(dir) {
274        Ok(e) => e,
275        Err(_) => return file_set.recovery_block_count,
276    };
277
278    let mut count = 0u32;
279    for entry in entries.flatten() {
280        let path = entry.path();
281        if path
282            .extension()
283            .is_some_and(|e| e.eq_ignore_ascii_case("par2"))
284        {
285            if let Ok(parsed) = crate::packets::parse_par2_file(&path) {
286                if parsed.recovery_set_id == file_set.recovery_set_id {
287                    count += parsed.recovery_block_count;
288                }
289            }
290        }
291    }
292
293    // Fall back to file_set count if directory scan found nothing
294    if count == 0 {
295        file_set.recovery_block_count
296    } else {
297        count
298    }
299}
300
/// Number of slices (blocks) needed to cover a file of `file_size` bytes.
///
/// Returns 0 for a zero `slice_size` (degenerate set) or an empty file;
/// otherwise the ceiling of `file_size / slice_size`.
fn blocks_for_file(file_size: u64, slice_size: u64) -> u32 {
    match slice_size {
        0 => 0,
        s => file_size.div_ceil(s) as u32,
    }
}
308
309// ---------------------------------------------------------------------------
310// Tests
311// ---------------------------------------------------------------------------
312
#[cfg(test)]
mod tests {
    use super::*;
    use crate::packets::parse_par2_file;

    /// End-to-end verification of the basic par2test data set: every file in
    /// the set must land in exactly one of the three buckets.
    #[test]
    fn test_verify_intact_set() {
        let par2_path =
            Path::new("/home/sprooty/sabnzbd/tests/data/par2repair/basic/par2test.par2");
        let dir = Path::new("/home/sprooty/sabnzbd/tests/data/par2repair/basic");

        if !par2_path.exists() {
            eprintln!("Skipping test: test data not found");
            return;
        }

        let set = parse_par2_file(par2_path).unwrap();
        let result = verify(&set, dir);

        // The fixture mixes intact and broken files:
        // - par2test.part2.rar (102400 bytes) should be intact
        // - par2test.part1.rar is only 9 bytes (damaged/truncated)
        // - par2test.part5.rar is only 8 bytes
        // - some files might be missing entirely
        println!("Verify result: {result}");
        let intact_names: Vec<_> = result.intact.iter().map(|f| &f.filename).collect();
        let damaged_names: Vec<_> = result.damaged.iter().map(|f| &f.filename).collect();
        let missing_names: Vec<_> = result.missing.iter().map(|f| &f.filename).collect();
        println!("  intact:  {:?}", intact_names);
        println!("  damaged: {:?}", damaged_names);
        println!("  missing: {:?}", missing_names);

        // Every file in the set must be accounted for somewhere.
        let total = result.intact.len() + result.damaged.len() + result.missing.len();
        assert_eq!(total, 6, "should account for all 6 files");
    }

    /// Table-driven check of the slice-count calculation.
    #[test]
    fn test_blocks_for_file() {
        let cases: [(u64, u64, u32); 6] = [
            (100000, 100000, 1),
            (100001, 100000, 2),
            (200000, 100000, 2),
            (0, 100000, 0),
            (1, 100000, 1),
            (102400, 100000, 2),
        ];
        for (file_size, slice_size, expected) in cases {
            assert_eq!(blocks_for_file(file_size, slice_size), expected);
        }
    }

    /// compute_hash_16k should produce a non-trivial digest for real data.
    #[test]
    fn test_hash_16k() {
        let path =
            Path::new("/home/sprooty/sabnzbd/tests/data/par2repair/basic/par2test.part2.rar");
        if !path.exists() {
            eprintln!("Skipping test: test data not found");
            return;
        }

        let hash = compute_hash_16k(path).unwrap();
        assert_ne!(hash, [0u8; 16], "hash should not be all zeros");
    }

    /// The computed 16K hash of an intact file must equal the hash the PAR2
    /// set stores for it.
    #[test]
    fn test_hash_16k_matches_par2() {
        let par2_path =
            Path::new("/home/sprooty/sabnzbd/tests/data/par2repair/basic/par2test.par2");
        let dir = Path::new("/home/sprooty/sabnzbd/tests/data/par2repair/basic");

        if !par2_path.exists() {
            eprintln!("Skipping test: test data not found");
            return;
        }

        let set = parse_par2_file(par2_path).unwrap();

        // par2test.part2.rar should be an intact 102400-byte file.
        let Some(part2) = set
            .files
            .values()
            .find(|f| f.filename == "par2test.part2.rar")
        else {
            panic!("part2 should exist in par2 set");
        };

        let file_path = dir.join("par2test.part2.rar");
        let size_ok = file_path.exists()
            && std::fs::metadata(&file_path).unwrap().len() == part2.size;
        if !size_ok {
            eprintln!("Skipping: par2test.part2.rar is not the expected size");
            return;
        }

        let computed = compute_hash_16k(&file_path).unwrap();
        assert_eq!(
            computed, part2.hash_16k,
            "computed 16K hash should match PAR2 stored hash"
        );
    }
}
429}