// superbook_pdf/cache.rs
1//! Processing cache module for smart re-processing skip
2//!
3//! This module implements hash-based caching to skip re-processing
4//! of unchanged PDFs with the same options.
5
6use serde::{Deserialize, Serialize};
7use sha2::{Digest, Sha256};
8use std::fs;
9use std::io;
10use std::path::{Path, PathBuf};
11use std::time::SystemTime;
12
/// Cache file extension, appended to the *full* output file name
/// (e.g. `out.pdf` -> `out.pdf.superbook-cache`)
pub const CACHE_EXTENSION: &str = ".superbook-cache";

/// Current cache version; bump on any incompatible change to the
/// on-disk cache format so stale caches are treated as invalid
pub const CACHE_VERSION: u32 = 1;
18
/// Digest that uniquely identifies a processing run
///
/// Two runs are considered identical when the source file's modification
/// time and size match and the processing options hash to the same value.
/// NOTE: the source file *contents* are not hashed — only metadata — so a
/// same-size, same-mtime rewrite would not invalidate the cache.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CacheDigest {
    /// Source file last modified time (Unix timestamp, seconds)
    pub source_modified: u64,
    /// Source file size in bytes
    pub source_size: u64,
    /// Hash of the processing-options JSON, formatted as `"sha256:<hex>"`
    pub options_hash: String,
}
29
30impl CacheDigest {
31    /// Create a new digest from source path and options
32    ///
33    /// # Arguments
34    /// * `source_path` - Path to the source PDF file
35    /// * `options_json` - JSON string of processing options
36    ///
37    /// # Returns
38    /// A new CacheDigest or an error if the file cannot be accessed
39    pub fn new<P: AsRef<Path>>(source_path: P, options_json: &str) -> io::Result<Self> {
40        let metadata = fs::metadata(source_path.as_ref())?;
41        let modified = metadata
42            .modified()
43            .unwrap_or(SystemTime::UNIX_EPOCH)
44            .duration_since(SystemTime::UNIX_EPOCH)
45            .unwrap_or_default()
46            .as_secs();
47        let size = metadata.len();
48
49        let mut hasher = Sha256::new();
50        hasher.update(options_json.as_bytes());
51        let hash = format!("sha256:{:x}", hasher.finalize());
52
53        Ok(Self {
54            source_modified: modified,
55            source_size: size,
56            options_hash: hash,
57        })
58    }
59
60    /// Create a digest with explicit values (for testing)
61    pub fn with_values(source_modified: u64, source_size: u64, options_hash: &str) -> Self {
62        Self {
63            source_modified,
64            source_size,
65            options_hash: options_hash.to_string(),
66        }
67    }
68}
69
/// Processing result metadata
///
/// Summary of a completed processing run, stored alongside the digest so
/// that a cache hit can report what the skipped run would have produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessingResult {
    /// Number of pages processed
    pub page_count: usize,
    /// Detected page number shift (if any)
    pub page_number_shift: Option<i32>,
    /// Whether vertical text was detected
    pub is_vertical: bool,
    /// Processing time in seconds
    pub elapsed_seconds: f64,
    /// Output file size in bytes
    pub output_size: u64,
}
84
85impl Default for ProcessingResult {
86    fn default() -> Self {
87        Self {
88            page_count: 0,
89            page_number_shift: None,
90            is_vertical: false,
91            elapsed_seconds: 0.0,
92            output_size: 0,
93        }
94    }
95}
96
97impl ProcessingResult {
98    /// Create a new ProcessingResult
99    pub fn new(
100        page_count: usize,
101        page_number_shift: Option<i32>,
102        is_vertical: bool,
103        elapsed_seconds: f64,
104        output_size: u64,
105    ) -> Self {
106        Self {
107            page_count,
108            page_number_shift,
109            is_vertical,
110            elapsed_seconds,
111            output_size,
112        }
113    }
114}
115
/// Processing cache entry
///
/// Serialized to pretty JSON next to the output file (see `CACHE_EXTENSION`).
/// A cache entry is honored only when its `version` equals `CACHE_VERSION`
/// and its `digest` matches the current source/options digest.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessingCache {
    /// Cache version for compatibility check
    pub version: u32,
    /// Processing timestamp (Unix timestamp, seconds)
    pub processed_at: u64,
    /// Digest that identifies this processing run
    pub digest: CacheDigest,
    /// Processing result metadata
    pub result: ProcessingResult,
}
128
129impl ProcessingCache {
130    /// Create a new ProcessingCache
131    pub fn new(digest: CacheDigest, result: ProcessingResult) -> Self {
132        let processed_at = SystemTime::now()
133            .duration_since(SystemTime::UNIX_EPOCH)
134            .unwrap_or_default()
135            .as_secs();
136
137        Self {
138            version: CACHE_VERSION,
139            processed_at,
140            digest,
141            result,
142        }
143    }
144
145    /// Get the cache file path for an output file
146    pub fn cache_path<P: AsRef<Path>>(output_path: P) -> PathBuf {
147        let mut path = output_path.as_ref().as_os_str().to_owned();
148        path.push(CACHE_EXTENSION);
149        PathBuf::from(path)
150    }
151
152    /// Load cache from file
153    ///
154    /// # Arguments
155    /// * `output_path` - Path to the output PDF file (cache file is derived from this)
156    ///
157    /// # Returns
158    /// The loaded cache or an error
159    pub fn load<P: AsRef<Path>>(output_path: P) -> io::Result<Self> {
160        let cache_path = Self::cache_path(output_path);
161        let content = fs::read_to_string(&cache_path)?;
162        serde_json::from_str(&content)
163            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
164    }
165
166    /// Save cache to file
167    ///
168    /// # Arguments
169    /// * `output_path` - Path to the output PDF file (cache file is derived from this)
170    pub fn save<P: AsRef<Path>>(&self, output_path: P) -> io::Result<()> {
171        let cache_path = Self::cache_path(output_path);
172        let content = serde_json::to_string_pretty(self)
173            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
174        fs::write(&cache_path, content)
175    }
176
177    /// Check if the cache is valid for a given digest
178    ///
179    /// # Arguments
180    /// * `digest` - The digest to compare against
181    ///
182    /// # Returns
183    /// `true` if the cache is valid (version matches and digest matches)
184    pub fn is_valid(&self, digest: &CacheDigest) -> bool {
185        self.version == CACHE_VERSION && self.digest == *digest
186    }
187
188    /// Delete the cache file
189    pub fn delete<P: AsRef<Path>>(output_path: P) -> io::Result<()> {
190        let cache_path = Self::cache_path(output_path);
191        if cache_path.exists() {
192            fs::remove_file(cache_path)?;
193        }
194        Ok(())
195    }
196}
197
198/// Check if processing should be skipped based on cache
199///
200/// # Arguments
201/// * `source_path` - Path to the source PDF
202/// * `output_path` - Path to the output PDF
203/// * `options_json` - JSON string of processing options
204/// * `force` - If true, always return false (force re-processing)
205///
206/// # Returns
207/// `Some(cache)` if processing should be skipped, `None` otherwise
208pub fn should_skip_processing<P1: AsRef<Path>, P2: AsRef<Path>>(
209    source_path: P1,
210    output_path: P2,
211    options_json: &str,
212    force: bool,
213) -> Option<ProcessingCache> {
214    if force {
215        return None;
216    }
217
218    // Check if output exists
219    if !output_path.as_ref().exists() {
220        return None;
221    }
222
223    // Try to create digest
224    let digest = CacheDigest::new(&source_path, options_json).ok()?;
225
226    // Try to load and validate cache
227    let cache = ProcessingCache::load(&output_path).ok()?;
228    if cache.is_valid(&digest) {
229        Some(cache)
230    } else {
231        None
232    }
233}
234
// Unit tests: digest construction, cache round-tripping, validity checks,
// and the skip decision. Tests use `tempfile` for on-disk fixtures.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    // ============ CacheDigest Tests ============

    #[test]
    fn test_cache_digest_new() {
        // TC: CACHE-001
        let mut temp = NamedTempFile::new().unwrap();
        temp.write_all(b"test content").unwrap();

        let digest = CacheDigest::new(temp.path(), r#"{"dpi": 300}"#).unwrap();

        assert!(digest.source_modified > 0);
        assert_eq!(digest.source_size, 12); // "test content" = 12 bytes
        assert!(digest.options_hash.starts_with("sha256:"));
    }

    #[test]
    fn test_cache_digest_with_values() {
        let digest = CacheDigest::with_values(1234567890, 999, "sha256:abc123");

        assert_eq!(digest.source_modified, 1234567890);
        assert_eq!(digest.source_size, 999);
        assert_eq!(digest.options_hash, "sha256:abc123");
    }

    #[test]
    fn test_cache_digest_same_options_same_hash() {
        // TC: CACHE-002
        // Same file + same options must produce an identical digest.
        let mut temp = NamedTempFile::new().unwrap();
        temp.write_all(b"test").unwrap();

        let digest1 = CacheDigest::new(temp.path(), r#"{"dpi": 300}"#).unwrap();
        let digest2 = CacheDigest::new(temp.path(), r#"{"dpi": 300}"#).unwrap();

        assert_eq!(digest1, digest2);
    }

    #[test]
    fn test_cache_digest_different_file() {
        // TC: CACHE-003
        let mut temp1 = NamedTempFile::new().unwrap();
        temp1.write_all(b"content1").unwrap();

        let mut temp2 = NamedTempFile::new().unwrap();
        temp2.write_all(b"different content").unwrap();

        let digest1 = CacheDigest::new(temp1.path(), r#"{"dpi": 300}"#).unwrap();
        let digest2 = CacheDigest::new(temp2.path(), r#"{"dpi": 300}"#).unwrap();

        // Different sizes
        assert_ne!(digest1.source_size, digest2.source_size);
    }

    #[test]
    fn test_cache_digest_different_options() {
        // TC: CACHE-004
        // Only the options JSON feeds the hash, so changing it must change it.
        let mut temp = NamedTempFile::new().unwrap();
        temp.write_all(b"test").unwrap();

        let digest1 = CacheDigest::new(temp.path(), r#"{"dpi": 300}"#).unwrap();
        let digest2 = CacheDigest::new(temp.path(), r#"{"dpi": 600}"#).unwrap();

        assert_ne!(digest1.options_hash, digest2.options_hash);
    }

    #[test]
    fn test_cache_digest_nonexistent_file() {
        let result = CacheDigest::new("/nonexistent/file.pdf", "{}");
        assert!(result.is_err());
    }

    // ============ ProcessingResult Tests ============

    #[test]
    fn test_processing_result_default() {
        let result = ProcessingResult::default();

        assert_eq!(result.page_count, 0);
        assert_eq!(result.page_number_shift, None);
        assert!(!result.is_vertical);
        assert_eq!(result.elapsed_seconds, 0.0);
        assert_eq!(result.output_size, 0);
    }

    #[test]
    fn test_processing_result_new() {
        let result = ProcessingResult::new(100, Some(2), true, 45.5, 12345678);

        assert_eq!(result.page_count, 100);
        assert_eq!(result.page_number_shift, Some(2));
        assert!(result.is_vertical);
        assert_eq!(result.elapsed_seconds, 45.5);
        assert_eq!(result.output_size, 12345678);
    }

    // ============ ProcessingCache Tests ============

    #[test]
    fn test_processing_cache_new() {
        let digest = CacheDigest::with_values(1234567890, 999, "sha256:abc");
        let result = ProcessingResult::default();
        let cache = ProcessingCache::new(digest.clone(), result);

        assert_eq!(cache.version, CACHE_VERSION);
        assert!(cache.processed_at > 0);
        assert_eq!(cache.digest, digest);
    }

    #[test]
    fn test_processing_cache_path() {
        // Extension is appended to the full name, not substituted.
        let path = ProcessingCache::cache_path("/output/file.pdf");
        assert_eq!(path.to_string_lossy(), "/output/file.pdf.superbook-cache");
    }

    #[test]
    fn test_processing_cache_save_load() {
        // TC: CACHE-005, CACHE-006
        let temp_dir = tempfile::tempdir().unwrap();
        let output_path = temp_dir.path().join("output.pdf");

        // Create and save cache
        let digest = CacheDigest::with_values(1234567890, 999, "sha256:abc");
        let result = ProcessingResult::new(50, Some(3), true, 10.5, 5000000);
        let cache = ProcessingCache::new(digest.clone(), result);
        cache.save(&output_path).unwrap();

        // Load and verify
        let loaded = ProcessingCache::load(&output_path).unwrap();
        assert_eq!(loaded.version, cache.version);
        assert_eq!(loaded.digest, cache.digest);
        assert_eq!(loaded.result.page_count, 50);
        assert_eq!(loaded.result.page_number_shift, Some(3));
        assert!(loaded.result.is_vertical);
    }

    #[test]
    fn test_processing_cache_load_nonexistent() {
        // TC: CACHE-007
        let result = ProcessingCache::load("/nonexistent/file.pdf");
        assert!(result.is_err());
    }

    #[test]
    fn test_processing_cache_is_valid_same() {
        let digest = CacheDigest::with_values(1234567890, 999, "sha256:abc");
        let cache = ProcessingCache::new(digest.clone(), ProcessingResult::default());

        assert!(cache.is_valid(&digest));
    }

    #[test]
    fn test_processing_cache_is_valid_different_digest() {
        let digest1 = CacheDigest::with_values(1234567890, 999, "sha256:abc");
        let digest2 = CacheDigest::with_values(1234567890, 1000, "sha256:abc");
        let cache = ProcessingCache::new(digest1, ProcessingResult::default());

        assert!(!cache.is_valid(&digest2));
    }

    #[test]
    fn test_processing_cache_version_mismatch() {
        // TC: CACHE-008
        // A hand-written cache file with a future version must not validate,
        // even when the digest itself matches.
        let temp_dir = tempfile::tempdir().unwrap();
        let output_path = temp_dir.path().join("output.pdf");

        // Create cache with future version
        let cache_content = r#"{
            "version": 999,
            "processed_at": 1234567890,
            "digest": {
                "source_modified": 1234567890,
                "source_size": 999,
                "options_hash": "sha256:abc"
            },
            "result": {
                "page_count": 10,
                "page_number_shift": null,
                "is_vertical": false,
                "elapsed_seconds": 5.0,
                "output_size": 1000
            }
        }"#;

        let cache_path = ProcessingCache::cache_path(&output_path);
        fs::write(&cache_path, cache_content).unwrap();

        let loaded = ProcessingCache::load(&output_path).unwrap();
        let digest = CacheDigest::with_values(1234567890, 999, "sha256:abc");

        // Version mismatch should make it invalid
        assert!(!loaded.is_valid(&digest));
    }

    #[test]
    fn test_processing_cache_corrupted() {
        // TC: CACHE-009
        let temp_dir = tempfile::tempdir().unwrap();
        let output_path = temp_dir.path().join("output.pdf");
        let cache_path = ProcessingCache::cache_path(&output_path);

        fs::write(&cache_path, "not valid json").unwrap();

        let result = ProcessingCache::load(&output_path);
        assert!(result.is_err());
    }

    #[test]
    fn test_processing_cache_delete() {
        let temp_dir = tempfile::tempdir().unwrap();
        let output_path = temp_dir.path().join("output.pdf");

        // Create cache
        let digest = CacheDigest::with_values(1234567890, 999, "sha256:abc");
        let cache = ProcessingCache::new(digest, ProcessingResult::default());
        cache.save(&output_path).unwrap();

        // Verify exists
        let cache_path = ProcessingCache::cache_path(&output_path);
        assert!(cache_path.exists());

        // Delete
        ProcessingCache::delete(&output_path).unwrap();
        assert!(!cache_path.exists());
    }

    #[test]
    fn test_processing_cache_delete_nonexistent() {
        // Should not error when deleting nonexistent cache
        let result = ProcessingCache::delete("/nonexistent/file.pdf");
        assert!(result.is_ok());
    }

    // ============ should_skip_processing Tests ============

    #[test]
    fn test_should_skip_with_force() {
        // TC: CACHE-010
        let temp_dir = tempfile::tempdir().unwrap();
        let source_path = temp_dir.path().join("source.pdf");
        let output_path = temp_dir.path().join("output.pdf");

        fs::write(&source_path, "source").unwrap();
        fs::write(&output_path, "output").unwrap();

        // Create valid cache
        let digest = CacheDigest::new(&source_path, "{}").unwrap();
        let cache = ProcessingCache::new(digest, ProcessingResult::default());
        cache.save(&output_path).unwrap();

        // With force=true, should not skip
        let result = should_skip_processing(&source_path, &output_path, "{}", true);
        assert!(result.is_none());
    }

    #[test]
    fn test_should_skip_no_output() {
        let temp_dir = tempfile::tempdir().unwrap();
        let source_path = temp_dir.path().join("source.pdf");
        let output_path = temp_dir.path().join("output.pdf");

        fs::write(&source_path, "source").unwrap();
        // output does not exist

        let result = should_skip_processing(&source_path, &output_path, "{}", false);
        assert!(result.is_none());
    }

    #[test]
    fn test_should_skip_no_cache() {
        let temp_dir = tempfile::tempdir().unwrap();
        let source_path = temp_dir.path().join("source.pdf");
        let output_path = temp_dir.path().join("output.pdf");

        fs::write(&source_path, "source").unwrap();
        fs::write(&output_path, "output").unwrap();
        // No cache file

        let result = should_skip_processing(&source_path, &output_path, "{}", false);
        assert!(result.is_none());
    }

    #[test]
    fn test_should_skip_valid_cache() {
        let temp_dir = tempfile::tempdir().unwrap();
        let source_path = temp_dir.path().join("source.pdf");
        let output_path = temp_dir.path().join("output.pdf");

        fs::write(&source_path, "source").unwrap();
        fs::write(&output_path, "output").unwrap();

        // Create valid cache
        let options = r#"{"dpi": 300}"#;
        let digest = CacheDigest::new(&source_path, options).unwrap();
        let result = ProcessingResult::new(10, None, false, 5.0, 1000);
        let cache = ProcessingCache::new(digest, result);
        cache.save(&output_path).unwrap();

        // Should skip with same options
        let skip_result = should_skip_processing(&source_path, &output_path, options, false);
        assert!(skip_result.is_some());
        assert_eq!(skip_result.unwrap().result.page_count, 10);
    }

    #[test]
    fn test_should_skip_options_changed() {
        let temp_dir = tempfile::tempdir().unwrap();
        let source_path = temp_dir.path().join("source.pdf");
        let output_path = temp_dir.path().join("output.pdf");

        fs::write(&source_path, "source").unwrap();
        fs::write(&output_path, "output").unwrap();

        // Create cache with different options
        let digest = CacheDigest::new(&source_path, r#"{"dpi": 300}"#).unwrap();
        let cache = ProcessingCache::new(digest, ProcessingResult::default());
        cache.save(&output_path).unwrap();

        // Should NOT skip with different options
        let result = should_skip_processing(&source_path, &output_path, r#"{"dpi": 600}"#, false);
        assert!(result.is_none());
    }
}