codebook_downloader/
lib.rs

1use anyhow::Result;
2use chrono::{DateTime, Utc};
3use log::info;
4use reqwest::blocking::{Client, Response};
5use reqwest::header::{IF_MODIFIED_SINCE, LAST_MODIFIED};
6use rustls::ClientConfig;
7use rustls_platform_verifier::BuilderVerifierExt;
8use serde::{Deserialize, Serialize};
9use sha2::{Digest, Sha256};
10use std::collections::HashMap;
11use std::fs::{self, File};
12use std::io::{BufReader, Read};
13use std::path::{Path, PathBuf};
14use std::sync::{OnceLock, RwLock};
15use tempfile::NamedTempFile;
16
/// Name of the JSON index file kept inside the cache directory.
const METADATA_FILE: &str = "_metadata.json";
/// Two weeks expressed in seconds (days * hours * minutes * seconds).
const TWO_WEEKS: u64 = 14 * 24 * 60 * 60;
19
20#[derive(Debug, Serialize, Deserialize, Default)]
21struct Metadata {
22    files: HashMap<String, FileEntry>,
23}
24
25#[derive(Debug, Serialize, Deserialize, Clone)]
26struct FileEntry {
27    path: PathBuf,
28    last_checked: DateTime<Utc>,
29    last_modified: Option<DateTime<Utc>>,
30    content_hash: String,
31}
32
33pub struct Downloader {
34    cache_dir: PathBuf,
35    metadata_path: PathBuf,
36    metadata: OnceLock<RwLock<Metadata>>,
37    _client: OnceLock<Client>,
38}
39
40impl Downloader {
41    pub fn new(cache_dir: impl AsRef<Path>) -> Self {
42        let cache_dir = cache_dir.as_ref().to_path_buf();
43        info!("Cache folder at: {cache_dir:?}");
44
45        let metadata_path = cache_dir.join(METADATA_FILE);
46
47        Self {
48            cache_dir,
49            metadata_path,
50            metadata: OnceLock::new(),
51            _client: OnceLock::new(),
52        }
53    }
54
55    fn client(&self) -> &Client {
56        self._client.get_or_init(|| {
57            // Set up rustls_platform_verifier to use OS cert chains (proxy support)
58            let arc_crypto_provider =
59                std::sync::Arc::new(rustls::crypto::aws_lc_rs::default_provider());
60            let config = ClientConfig::builder_with_provider(arc_crypto_provider)
61                .with_safe_default_protocol_versions()
62                .unwrap()
63                .with_platform_verifier()
64                .unwrap()
65                .with_no_client_auth();
66            reqwest::blocking::Client::builder()
67                .use_preconfigured_tls(config)
68                .build()
69                .expect("Failed to build http client")
70        })
71    }
72
73    fn metadata(&self) -> &RwLock<Metadata> {
74        let metadata_path = self.metadata_path.clone();
75        let cache_dir = self.cache_dir.clone();
76        self.metadata.get_or_init(move || {
77            fs::create_dir_all(&cache_dir)
78                .expect("Failed to create cache directory: {cache_dir:?}");
79            let metadata = Self::load_metadata(&metadata_path);
80            RwLock::new(metadata)
81        })
82    }
83
84    fn load_metadata(metadata_path: &Path) -> Metadata {
85        match File::open(metadata_path) {
86            Ok(file) => match serde_json::from_reader(file) {
87                Ok(metadata) => metadata,
88                Err(err) => {
89                    log::warn!("Failed to parse metadata file {metadata_path:?}: {err}");
90                    Metadata::default()
91                }
92            },
93            Err(err) => {
94                if err.kind() != std::io::ErrorKind::NotFound {
95                    log::warn!("Failed to open metadata file {metadata_path:?}: {err}");
96                }
97                Metadata::default()
98            }
99        }
100    }
101
102    fn persist_metadata(&self, metadata: &Metadata) -> Result<()> {
103        let file = File::create(&self.metadata_path)?;
104        serde_json::to_writer_pretty(file, metadata)?;
105        Ok(())
106    }
107
108    fn purge_stale_entry(&self, url: &str, stale_path: &Path) {
109        let mut metadata = self.metadata().write().unwrap();
110        if metadata
111            .files
112            .get(url)
113            .map(|entry| entry.path == stale_path)
114            .unwrap_or(false)
115        {
116            metadata.files.remove(url);
117            if let Err(err) = self.persist_metadata(&metadata) {
118                log::error!(
119                    "Failed to persist metadata after removing stale entry for {url}: {err}"
120                );
121            }
122        }
123    }
124
125    pub fn get(&self, url: &str) -> Result<PathBuf> {
126        let entry = {
127            let metadata = self.metadata().read().unwrap();
128            metadata.files.get(url).cloned()
129        };
130
131        let result = match entry {
132            Some(entry) => {
133                if !entry.path.exists() {
134                    self.purge_stale_entry(url, &entry.path);
135                    self.download_new(url)
136                } else {
137                    let needs_update =
138                        entry.last_checked.timestamp() + TWO_WEEKS as i64 <= Utc::now().timestamp();
139                    if needs_update {
140                        self.try_update(url)
141                    } else {
142                        Ok(entry.path)
143                    }
144                }
145            }
146            None => self.download_new(url),
147        };
148
149        result.or_else(|e| {
150            log::error!("Failed to update, using cached version: {e}");
151            let entry = {
152                let metadata = self.metadata().read().unwrap();
153                metadata
154                    .files
155                    .get(url)
156                    .map(|file_info| file_info.path.clone())
157            };
158            match entry {
159                Some(path) => {
160                    if path.exists() {
161                        Ok(path)
162                    } else {
163                        self.purge_stale_entry(url, &path);
164                        // If fallback path doesn't exist, try to download anyway
165                        self.download_new(url)
166                    }
167                }
168                None => {
169                    // URL not found in the files hashmap, try to download it
170                    log::warn!("URL not found in cache, attempting fresh download: {url}");
171                    self.download_new(url)
172                }
173            }
174        })
175    }
176
177    fn try_update(&self, url: &str) -> Result<PathBuf> {
178        // Get last modified time with read lock
179        let last_modified = {
180            self.metadata()
181                .read()
182                .unwrap()
183                .files
184                .get(url)
185                .and_then(|e| e.last_modified)
186        };
187        // log::info!("{:?}", last_modified);
188        // log::info!("URL {:?}", url);
189
190        let mut request = self.client().get(url);
191        if let Some(lm) = last_modified {
192            request = request.header(IF_MODIFIED_SINCE, lm.with_timezone(&Utc).to_rfc2822());
193        }
194        // log::info!("{:?}", request);
195
196        let response = request.send()?;
197        // log::info!("RESPONSE {:?}", response);
198
199        match response.status().as_u16() {
200            304 => self.update_check_time(url),
201            200 => self.handle_updated_response(url, response),
202            status => {
203                let _ = self.update_check_time(url);
204                Err(anyhow::anyhow!("Unexpected status code: {}", status))
205            }
206        }
207    }
208
209    fn handle_updated_response(&self, url: &str, response: Response) -> Result<PathBuf> {
210        let last_modified = parse_last_modified(&response);
211        let temp_file = self.download_to_temp(response)?;
212        let new_hash = compute_file_hash(temp_file.path())?;
213        let old_hash = {
214            let metadata = self
215                .metadata()
216                .read()
217                .map_err(|e| anyhow::anyhow!("Lock error: {}", e))?;
218            metadata.files.get(url).unwrap().content_hash.clone()
219        };
220        if new_hash == old_hash {
221            self.update_check_time(url)
222        } else {
223            self.replace_file(url, temp_file, last_modified, new_hash)
224        }
225    }
226
227    fn download_new(&self, url: &str) -> Result<PathBuf> {
228        let response = self.client().get(url).send()?;
229        let last_modified = parse_last_modified(&response);
230        let temp_file = self.download_to_temp(response)?;
231        let new_hash = compute_file_hash(temp_file.path())?;
232        self.store_new_file(url, temp_file, last_modified, new_hash)
233    }
234
235    fn download_to_temp(&self, mut response: Response) -> Result<NamedTempFile> {
236        let mut temp_file = NamedTempFile::new_in(&self.cache_dir)?;
237        std::io::copy(&mut response, &mut temp_file)?;
238        Ok(temp_file)
239    }
240
241    fn store_new_file(
242        &self,
243        url: &str,
244        temp_file: NamedTempFile,
245        last_modified: Option<DateTime<Utc>>,
246        content_hash: String,
247    ) -> Result<PathBuf> {
248        let filename = hash_url(url);
249        let path = self.cache_dir.join(filename);
250        temp_file.persist(&path)?;
251
252        let entry = FileEntry {
253            path: path.clone(),
254            last_checked: Utc::now(),
255            last_modified,
256            content_hash,
257        };
258        {
259            let mut metadata = self.metadata().write().unwrap();
260            metadata.files.insert(url.to_string(), entry);
261            self.persist_metadata(&metadata)?;
262        }
263        Ok(path)
264    }
265
266    fn replace_file(
267        &self,
268        url: &str,
269        temp_file: NamedTempFile,
270        last_modified: Option<DateTime<Utc>>,
271        content_hash: String,
272    ) -> Result<PathBuf> {
273        let new_path: PathBuf;
274        {
275            let mut metadata = self.metadata().write().unwrap();
276            let entry = metadata.files.get_mut(url).unwrap();
277            let old_path = entry.path.clone();
278
279            new_path = self.cache_dir.join(hash_url(url));
280            temp_file.persist(&new_path)?;
281
282            // Remove old file if it's different
283            if old_path != new_path && old_path.exists() {
284                fs::remove_file(old_path)?;
285            }
286
287            entry.path = new_path.clone();
288            entry.last_checked = Utc::now();
289            entry.last_modified = last_modified;
290            entry.content_hash = content_hash;
291            self.persist_metadata(&metadata)?;
292        }
293
294        Ok(new_path)
295    }
296
297    fn update_check_time(&self, url: &str) -> Result<PathBuf> {
298        let path: PathBuf;
299        {
300            let mut metadata = self.metadata().write().unwrap();
301            let entry = metadata.files.get_mut(url).unwrap();
302            entry.last_checked = Utc::now();
303            path = entry.path.clone();
304            self.persist_metadata(&metadata)?;
305        }
306        Ok(path)
307    }
308}
309
310fn hash_url(url: &str) -> String {
311    let mut hasher = Sha256::new();
312    hasher.update(url.as_bytes());
313    format!("{:x}", hasher.finalize())
314}
315
316fn compute_file_hash(path: &Path) -> Result<String> {
317    let file = File::open(path)?;
318    let mut reader = BufReader::new(file);
319    let mut hasher = Sha256::new();
320    let mut buffer = [0; 1024];
321
322    loop {
323        let count = reader.read(&mut buffer)?;
324        if count == 0 {
325            break;
326        }
327        hasher.update(&buffer[..count]);
328    }
329
330    Ok(format!("{:x}", hasher.finalize()))
331}
332
333fn parse_last_modified(response: &Response) -> Option<DateTime<Utc>> {
334    response
335        .headers()
336        .get(LAST_MODIFIED)
337        .and_then(|hv| hv.to_str().ok())
338        .and_then(|s| DateTime::parse_from_rfc2822(s).ok())
339        .map(|dt| dt.with_timezone(&Utc))
340}
341
342#[cfg(test)]
343mod tests {
344    use super::*;
345    use chrono::Duration;
346    use httpmock::MockServer;
347    use tempfile::tempdir;
348
349    #[test]
350    fn test_download_new_file() {
351        let server = MockServer::start();
352        let mock = server.mock(|when, then| {
353            when.method("GET").path("/test.txt");
354            then.status(200)
355                .body("test content")
356                .header("Last-Modified", "Wed, 21 Oct 2023 07:28:00 GMT");
357        });
358
359        let temp_dir = tempdir().unwrap();
360        let downloader = Downloader::new(temp_dir.path());
361        let path = downloader.get(&server.url("/test.txt")).unwrap();
362
363        mock.assert();
364        assert!(path.exists());
365        assert_eq!(std::fs::read_to_string(path).unwrap(), "test content");
366        let metadata = downloader.metadata().read().unwrap();
367        let entry = metadata.files.get(&server.url("/test.txt")).unwrap();
368        assert_eq!(entry.content_hash, compute_file_hash(&entry.path).unwrap());
369    }
370
371    #[test]
372    fn test_returns_cached_file_when_offline() {
373        let server = MockServer::start();
374        let temp_dir = tempdir().unwrap();
375
376        // First download to cache
377        let mock = server.mock(|when, then| {
378            when.method("GET").path("/test.txt");
379            then.status(200).body("cached content");
380        });
381
382        let downloader = Downloader::new(temp_dir.path());
383        let path = downloader.get(&server.url("/test.txt")).unwrap();
384        mock.assert();
385
386        // Now simulate offline
387        let downloader = Downloader::new(temp_dir.path());
388        // server.stop(); // Make sure server isn't running
389        let cached_path = downloader.get(&server.url("/test.txt")).unwrap();
390
391        assert_eq!(path, cached_path);
392        assert_eq!(
393            std::fs::read_to_string(cached_path).unwrap(),
394            "cached content"
395        );
396    }
397
398    #[test]
399    fn test_updates_file_when_modified() {
400        let server = MockServer::start();
401        let temp_dir = tempdir().unwrap();
402        let test_path = server.url("/test.txt");
403
404        // Initial download
405        let initial_last_modified = "Wed, 21 Oct 2020 07:28:00 GMT";
406        let mut mock1 = server.mock(|when, then| {
407            when.method("GET").path("/test.txt");
408            then.status(200)
409                .body("v1")
410                .header("Last-Modified", initial_last_modified);
411        });
412
413        let downloader = Downloader::new(temp_dir.path());
414        let path_v1 = downloader.get(&test_path).unwrap();
415        mock1.assert();
416        mock1.delete();
417
418        // Get stored metadata
419        let stored_last_modified = {
420            let metadata = downloader.metadata().read().unwrap();
421            metadata.files[&test_path].last_modified
422        };
423
424        // Set last checked to 3 weeks ago
425        {
426            let mut metadata = downloader.metadata().write().unwrap();
427            let entry = metadata.files.get_mut(&test_path).unwrap();
428            entry.last_checked = stored_last_modified.unwrap() - Duration::weeks(3);
429        }
430
431        // Update mock with new content
432        let mock2 = server.mock(|when, then| {
433            when.method("GET").path("/test.txt").header(
434                IF_MODIFIED_SINCE.as_str(),
435                stored_last_modified.unwrap().to_rfc2822(),
436            );
437            then.status(200)
438                .body("v2")
439                .header("Last-Modified", "Fri, 23 Oct 2020 07:28:00 GMT");
440        });
441
442        let path_v2 = downloader.get(&test_path).unwrap();
443        println!("path: {path_v2:?}");
444        mock2.assert();
445
446        assert_eq!(path_v1, path_v2);
447        assert_eq!(std::fs::read_to_string(path_v2).unwrap(), "v2");
448    }
449    #[test]
450    fn test_uses_stale_file_when_update_fails() {
451        let server = MockServer::start();
452        let temp_dir = tempdir().unwrap();
453
454        // Initial download
455        let mock1 = server.mock(|when, then| {
456            when.method("GET").path("/test.txt");
457            then.status(200).body("original");
458        });
459
460        let downloader = Downloader::new(temp_dir.path());
461        let original_path = downloader.get(&server.url("/test.txt")).unwrap();
462        mock1.assert();
463
464        // Force update check time and break the server
465        {
466            let mut metadata = downloader.metadata().try_write().unwrap();
467            if let Some(entry) = metadata.files.get_mut(&server.url("/test.txt")) {
468                entry.last_checked = Utc::now() - Duration::seconds(TWO_WEEKS as i64 * 2);
469            }
470        }
471        // server.stop();
472
473        let cached_path = downloader.get(&server.url("/test.txt")).unwrap();
474        assert_eq!(original_path, cached_path);
475    }
476
477    #[test]
478    fn test_doesnt_check_within_two_weeks() {
479        let server = MockServer::start();
480        let temp_dir = tempdir().unwrap();
481
482        // Initial download
483        let mock = server.mock(|when, then| {
484            when.method("GET").path("/test.txt");
485            then.status(200).body("content");
486        });
487
488        let downloader = Downloader::new(temp_dir.path());
489        downloader.get(&server.url("/test.txt")).unwrap();
490        mock.assert_calls(1);
491
492        // Subsequent call within two weeks
493        let mock2 = server.mock(|when, then| {
494            when.method("GET").path("/test.txt");
495            then.status(200);
496        });
497        downloader.get(&server.url("/test.txt")).unwrap();
498        mock2.assert_calls(0); // Should not make any new requests
499    }
500
501    #[test]
502    fn test_handles_304_not_modified() {
503        let server = MockServer::start();
504        let temp_dir = tempdir().unwrap();
505
506        // Initial download
507        let mut mock1 = server.mock(|when, then| {
508            when.method("GET").path("/test.txt");
509            then.status(200)
510                .body("content")
511                .header("Last-Modified", "Wed, 21 Oct 2020 07:28:00 GMT");
512        });
513
514        let downloader = Downloader::new(temp_dir.path());
515        let original_path = downloader.get(&server.url("/test.txt")).unwrap();
516        mock1.assert();
517        mock1.delete();
518
519        // Force check time
520        {
521            let mut metadata = downloader.metadata().write().unwrap();
522            if let Some(entry) = metadata.files.get_mut(&server.url("/test.txt")) {
523                entry.last_checked = DateTime::parse_from_rfc2822("Wed, 21 Oct 2020 07:28:00 GMT")
524                    .unwrap()
525                    .with_timezone(&Utc);
526            }
527        }
528
529        // 304 response
530        let mock2 = server.mock(|when, then| {
531            when.method("GET")
532                .path("/test.txt")
533                .header("If-Modified-Since", "Wed, 21 Oct 2020 07:28:00 +0000");
534            then.status(304);
535        });
536
537        let cached_path = downloader.get(&server.url("/test.txt")).unwrap();
538        mock2.assert();
539        assert_eq!(original_path, cached_path);
540        let metadata = downloader.metadata().read().unwrap();
541        let entry = metadata.files.get(&server.url("/test.txt")).unwrap();
542        assert!(entry.last_checked > Utc::now() - Duration::seconds(1));
543    }
544
545    #[test]
546    fn test_file_hashing() {
547        let url1 = "https://example.com/file1";
548        let url2 = "https://example.com/file2";
549
550        assert_ne!(hash_url(url1), hash_url(url2));
551
552        let same_url = "https://example.com/same";
553        assert_eq!(hash_url(same_url), hash_url(same_url));
554    }
555
556    #[test]
557    fn test_redownloads_when_file_missing() {
558        let server = MockServer::start();
559        let temp_dir = tempdir().unwrap();
560
561        // First download
562        let mut mock1 = server.mock(|when, then| {
563            when.method("GET").path("/test.txt");
564            then.status(200).body("content");
565        });
566
567        let downloader = Downloader::new(temp_dir.path());
568        let path = downloader.get(&server.url("/test.txt")).unwrap();
569        mock1.assert();
570        mock1.delete();
571
572        // Simulate file deletion but keep metadata
573        std::fs::remove_file(&path).unwrap();
574        assert!(!path.exists());
575
576        // Second request should redownload
577        let mock2 = server.mock(|when, then| {
578            when.method("GET").path("/test.txt");
579            then.status(200).body("redownloaded content");
580        });
581
582        let new_path = downloader.get(&server.url("/test.txt")).unwrap();
583        mock2.assert();
584
585        assert!(new_path.exists());
586        assert_eq!(
587            std::fs::read_to_string(&new_path).unwrap(),
588            "redownloaded content"
589        );
590    }
591}