Skip to main content

assist_rs/
data.rs

1//! Data file management for ASSIST.
2//!
3//! ASSIST needs three kinds of files to run end-to-end:
4//!
5//! 1. Planetary + asteroid ephemerides (`de440.bsp`, `sb441-n16.bsp`)
6//! 2. MPC observatory codes (`obscodes_extended.json`) for observer lookups
7//! 3. Earth orientation binary PCK kernels (`earth_*.bpc`) for sub-mas
8//!    ITRF93 → ICRF rotation of ground-based observatories. Three kernels
9//!    together span 1962 → ~2125: historical, current high-precision, and
10//!    long-term predict.
11//!
12//! [`DataManager`] downloads these to a local cache directory on demand and
13//! returns resolved paths for [`crate::Ephemeris::from_paths`],
14//! [`crate::ObservatoryTable::from_json`], and
15//! [`crate::earth_orientation::EarthOrientation::from_paths`].
16//!
17//! # Default data directory
18//!
19//! `$ASSIST_DATA_DIR` if set, otherwise `$XDG_CACHE_HOME/assist-rs/` or
20//! `~/.cache/assist-rs/`.
21//!
22//! # Example
23//!
24//! ```no_run
25//! use assist_rs::data::DataManager;
26//!
27//! let dm = DataManager::new();
28//! let paths = dm.ensure_ready()?;
29//! # Ok::<(), assist_rs::data::DataError>(())
30//! ```
31//!
//! Each downloaded file gets a `<filename>.meta.json` sidecar recording the
//! MD5 of the file as stored on disk plus the `Content-Length` and
//! `Last-Modified` headers from the HTTP response. On subsequent runs every
//! file that has a sidecar is re-hashed locally and re-downloaded if its MD5
//! no longer matches the recorded one; non-static files (the MPC obscodes
//! file and `earth_latest_high_prec.bpc`) are additionally checked via HEAD
//! and re-downloaded if the remote metadata differs.
37
38use std::fmt;
39use std::fs::{self, File};
40use std::io::{self, BufWriter, Read, Write};
41use std::path::{Path, PathBuf};
42
43use serde::{Deserialize, Serialize};
44
45// ─── Kernel catalog ─────────────────────────────────────────────────────────
46
/// One downloadable file in the data catalog.
struct KernelEntry {
    /// File name used inside the data directory.
    filename: &'static str,
    /// Remote location the file is fetched from.
    url: &'static str,
    /// `true` when the remote payload is gzip-compressed and has to be
    /// decompressed while it is written to disk.
    gzipped: bool,
    /// `true` when the cached copy is never checked against the remote for
    /// staleness once it has been downloaded.
    is_static: bool,
}
53
54const DEFAULT_KERNELS: &[KernelEntry] = &[
55    KernelEntry {
56        filename: "de440.bsp",
57        url: "https://naif.jpl.nasa.gov/pub/naif/generic_kernels/spk/planets/de440.bsp",
58        gzipped: false,
59        is_static: true,
60    },
61    KernelEntry {
62        filename: "sb441-n16.bsp",
63        url: "https://ssd.jpl.nasa.gov/ftp/eph/small_bodies/asteroids_de441/sb441-n16.bsp",
64        gzipped: false,
65        is_static: true,
66    },
67    KernelEntry {
68        filename: "obscodes_extended.json",
69        url: "https://minorplanetcenter.net/Extended_Files/obscodes_extended.json.gz",
70        gzipped: true,
71        is_static: false,
72    },
73    // Earth orientation binary PCKs. NAIF periodically republishes the
74    // historical and predict kernels with filenames encoding their coverage;
75    // bump the filenames here when upstream does. `earth_latest_high_prec.bpc`
76    // is a stable endpoint NAIF updates ~weekly.
77    KernelEntry {
78        filename: "earth_latest_high_prec.bpc",
79        url: "https://naif.jpl.nasa.gov/pub/naif/generic_kernels/pck/earth_latest_high_prec.bpc",
80        gzipped: false,
81        is_static: false,
82    },
83    KernelEntry {
84        filename: "earth_620120_250826.bpc",
85        url: "https://naif.jpl.nasa.gov/pub/naif/generic_kernels/pck/earth_620120_250826.bpc",
86        gzipped: false,
87        is_static: true,
88    },
89    KernelEntry {
90        filename: "earth_2025_250826_2125_predict.bpc",
91        url: "https://naif.jpl.nasa.gov/pub/naif/generic_kernels/pck/earth_2025_250826_2125_predict.bpc",
92        gzipped: false,
93        is_static: true,
94    },
95];
96
97// ─── Sidecar metadata ──────────────────────────────────────────────────────
98
99#[derive(Debug, Serialize, Deserialize)]
100struct FileMeta {
101    url: String,
102    downloaded_at: u64,
103    content_length: Option<u64>,
104    last_modified: Option<String>,
105    md5: String,
106}
107
108// ─── AssistDataPaths ────────────────────────────────────────────────────────
109
/// Locations on disk of every data file ASSIST needs.
#[derive(Debug, Clone)]
pub struct AssistDataPaths {
    /// SPK with the DE440 planetary ephemeris.
    pub planets: PathBuf,
    /// SPK with the SB441 N=16 small-body perturbers.
    pub asteroids: PathBuf,
    /// Decompressed MPC observatory codes JSON.
    pub obscodes: PathBuf,
    /// Current high-precision Earth orientation PCK, refreshed ~weekly by
    /// NAIF. Covers roughly 2000 to the near future.
    pub eop_high_prec: PathBuf,
    /// Historical Earth orientation PCK (1962 → ~present).
    pub eop_historical: PathBuf,
    /// Long-term predicted Earth orientation PCK (~2025 → 2125).
    pub eop_predict: PathBuf,
}

impl AssistDataPaths {
    /// The three EOP kernels ordered the SPICE-idiomatic way: predict first,
    /// then historical, then current. Hand the result straight to
    /// [`crate::earth_orientation::EarthOrientation::from_paths`] so that the
    /// current high-precision kernel takes precedence wherever it has
    /// coverage.
    pub fn eop_kernels(&self) -> [&PathBuf; 3] {
        let Self {
            eop_predict,
            eop_historical,
            eop_high_prec,
            ..
        } = self;
        [eop_predict, eop_historical, eop_high_prec]
    }
}
137
138// ─── DataError ──────────────────────────────────────────────────────────────
139
/// Errors from the data manager.
#[derive(Debug)]
pub enum DataError {
    /// Required files are missing (offline mode). Carries the missing file
    /// names.
    MissingFiles(Vec<String>),
    /// HTTP request failed. Carries a human-readable description.
    Http(String),
    /// I/O error.
    Io(io::Error),
}

impl fmt::Display for DataError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::MissingFiles(files) => write!(f, "missing data files: {}", files.join(", ")),
            Self::Http(msg) => write!(f, "HTTP error: {msg}"),
            Self::Io(e) => write!(f, "I/O error: {e}"),
        }
    }
}

impl std::error::Error for DataError {
    /// Expose the wrapped [`io::Error`] so callers walking the error chain
    /// can reach the underlying cause. The other variants carry only text
    /// and have no source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Io(e) => Some(e),
            Self::MissingFiles(_) | Self::Http(_) => None,
        }
    }
}
162
163// ─── DataManager ────────────────────────────────────────────────────────────
164
/// Downloads and caches the data files ASSIST needs.
///
/// Everything lives under a single local data directory. Unless one is
/// given explicitly, that directory is `$ASSIST_DATA_DIR` when set, and
/// otherwise `$XDG_CACHE_HOME/assist-rs/` or `~/.cache/assist-rs/`.
pub struct DataManager {
    /// Directory that holds the downloaded files and their sidecars.
    data_dir: PathBuf,
}
173
174impl Default for DataManager {
175    fn default() -> Self {
176        Self::new()
177    }
178}
179
180impl DataManager {
181    /// Create a `DataManager` with the default data directory.
182    pub fn new() -> Self {
183        let data_dir = std::env::var("ASSIST_DATA_DIR")
184            .map(PathBuf::from)
185            .unwrap_or_else(|_| {
186                if let Ok(xdg) = std::env::var("XDG_CACHE_HOME") {
187                    PathBuf::from(xdg).join("assist-rs")
188                } else {
189                    let home = std::env::var("HOME").expect("HOME environment variable not set");
190                    PathBuf::from(home).join(".cache").join("assist-rs")
191                }
192            });
193        Self { data_dir }
194    }
195
196    /// Create a `DataManager` with a custom data directory.
197    pub fn with_dir(dir: impl Into<PathBuf>) -> Self {
198        Self {
199            data_dir: dir.into(),
200        }
201    }
202
203    /// The data directory path.
204    pub fn data_dir(&self) -> &Path {
205        &self.data_dir
206    }
207
208    fn paths(&self) -> AssistDataPaths {
209        AssistDataPaths {
210            planets: self.data_dir.join("de440.bsp"),
211            asteroids: self.data_dir.join("sb441-n16.bsp"),
212            obscodes: self.data_dir.join("obscodes_extended.json"),
213            eop_high_prec: self.data_dir.join("earth_latest_high_prec.bpc"),
214            eop_historical: self.data_dir.join("earth_620120_250826.bpc"),
215            eop_predict: self.data_dir.join("earth_2025_250826_2125_predict.bpc"),
216        }
217    }
218
219    /// Return paths if all data files exist. No network access.
220    pub fn offline(&self) -> Result<AssistDataPaths, DataError> {
221        let missing: Vec<String> = DEFAULT_KERNELS
222            .iter()
223            .filter(|e| !self.data_dir.join(e.filename).exists())
224            .map(|e| e.filename.to_string())
225            .collect();
226        if !missing.is_empty() {
227            return Err(DataError::MissingFiles(missing));
228        }
229        Ok(self.paths())
230    }
231
232    /// Ensure all three files exist, downloading any that are missing, locally
233    /// corrupted, or stale upstream.
234    ///
235    /// For each file:
236    ///
237    /// - If the file is missing → download.
238    /// - Else if a sidecar exists, compare the local file's MD5 against the
239    ///   stored MD5. Mismatch implies local corruption or tampering →
240    ///   re-download. An error computing the MD5 (e.g. unreadable file) is
241    ///   propagated, not swallowed.
242    /// - Else if the file is non-static (e.g. `obscodes_extended.json`), HEAD
243    ///   the remote and re-download if `Content-Length` or `Last-Modified`
244    ///   differs from the sidecar. Network failures here are propagated —
245    ///   callers that need offline-tolerant behavior should use
246    ///   [`Self::offline`] instead.
247    /// - Else (static file, MD5 matches, or no sidecar to check against) →
248    ///   keep the cached copy.
249    pub fn ensure_ready(&self) -> Result<AssistDataPaths, DataError> {
250        fs::create_dir_all(&self.data_dir).map_err(DataError::Io)?;
251
252        for entry in DEFAULT_KERNELS {
253            let path = self.data_dir.join(entry.filename);
254            let meta_path = self.data_dir.join(format!("{}.meta.json", entry.filename));
255
256            if !path.exists() {
257                eprintln!("Downloading {}...", entry.filename);
258                download(entry, &path, &meta_path)?;
259                continue;
260            }
261
262            let Ok(meta) = read_meta(&meta_path) else {
263                // No sidecar — can't validate, assume caller knows what they
264                // put in the cache directory.
265                continue;
266            };
267
268            // Integrity check: the local file's MD5 must match what we
269            // recorded when we downloaded it. Catches on-disk corruption and
270            // deliberate replacement. An error here means we can't even
271            // read the local file — propagate rather than silently trust
272            // an unverifiable cache entry.
273            if !local_md5_matches(&path, &meta.md5)? {
274                eprintln!("Re-downloading {} (local MD5 mismatch)...", entry.filename);
275                download(entry, &path, &meta_path)?;
276                continue;
277            }
278
279            if entry.is_static {
280                continue;
281            }
282
283            // Staleness check: failure here is most often a network outage.
284            // Surface it rather than silently serving a possibly-stale cache;
285            // offline callers should use `offline()` instead of `ensure_ready`.
286            if is_stale(entry.url, &meta)? {
287                eprintln!("Updating {} (remote changed)...", entry.filename);
288                download(entry, &path, &meta_path)?;
289            }
290        }
291
292        Ok(self.paths())
293    }
294
295    /// Remove the data directory and all its contents.
296    pub fn clean(&self) -> Result<(), DataError> {
297        if self.data_dir.exists() {
298            fs::remove_dir_all(&self.data_dir).map_err(DataError::Io)?;
299        }
300        Ok(())
301    }
302}
303
304// ─── Private helpers ────────────────────────────────────────────────────────
305
306fn is_stale(url: &str, meta: &FileMeta) -> Result<bool, DataError> {
307    let response = ureq::head(url)
308        .call()
309        .map_err(|e| DataError::Http(format!("HEAD {url}: {e}")))?;
310
311    let remote_length: Option<u64> = response
312        .headers()
313        .get("content-length")
314        .and_then(|v| v.to_str().ok())
315        .and_then(|v| v.parse().ok());
316
317    let remote_modified: Option<&str> = response
318        .headers()
319        .get("last-modified")
320        .and_then(|v| v.to_str().ok());
321
322    if let (Some(remote), Some(local)) = (remote_length, meta.content_length) {
323        if remote != local {
324            return Ok(true);
325        }
326    }
327
328    if let (Some(remote), Some(local)) = (remote_modified, meta.last_modified.as_deref()) {
329        if remote != local {
330            return Ok(true);
331        }
332    }
333
334    Ok(false)
335}
336
337fn download(entry: &KernelEntry, path: &Path, meta_path: &Path) -> Result<(), DataError> {
338    let response = ureq::get(entry.url)
339        .call()
340        .map_err(|e| DataError::Http(format!("GET {}: {e}", entry.url)))?;
341
342    let content_length: Option<u64> = response
343        .headers()
344        .get("content-length")
345        .and_then(|v| v.to_str().ok())
346        .and_then(|v| v.parse().ok());
347
348    let last_modified: Option<String> = response
349        .headers()
350        .get("last-modified")
351        .and_then(|v| v.to_str().ok())
352        .map(|v| v.to_string());
353
354    if let Some(size) = content_length {
355        eprintln!("  {} ({:.1} MB)", entry.filename, size as f64 / 1_048_576.0);
356    }
357
358    let tmp_path = path.with_extension("tmp");
359    {
360        let mut body = response.into_body();
361        let file = File::create(&tmp_path).map_err(DataError::Io)?;
362        let mut writer = BufWriter::new(file);
363        if entry.gzipped {
364            let mut decoder = flate2::read::GzDecoder::new(body.as_reader());
365            io::copy(&mut decoder, &mut writer).map_err(DataError::Io)?;
366        } else {
367            io::copy(&mut body.as_reader(), &mut writer).map_err(DataError::Io)?;
368        }
369        writer.flush().map_err(DataError::Io)?;
370    }
371
372    let md5_hex = compute_md5(&tmp_path)?;
373
374    fs::rename(&tmp_path, path).map_err(DataError::Io)?;
375
376    let now = std::time::SystemTime::now()
377        .duration_since(std::time::UNIX_EPOCH)
378        .unwrap()
379        .as_secs();
380    let meta = FileMeta {
381        url: entry.url.to_string(),
382        downloaded_at: now,
383        content_length,
384        last_modified,
385        md5: md5_hex,
386    };
387    let json =
388        serde_json::to_string_pretty(&meta).map_err(|e| DataError::Io(io::Error::other(e)))?;
389    fs::write(meta_path, json).map_err(DataError::Io)?;
390
391    Ok(())
392}
393
394fn read_meta(path: &Path) -> Result<FileMeta, DataError> {
395    let content = fs::read_to_string(path).map_err(DataError::Io)?;
396    serde_json::from_str(&content)
397        .map_err(|e| DataError::Io(io::Error::new(io::ErrorKind::InvalidData, e)))
398}
399
400/// Compare the MD5 of `path` against `expected_hex`. Returns `Ok(true)` on
401/// match. Skips the check (returns `Ok(true)`) when the sidecar MD5 is empty,
402/// which covers legacy sidecars written before MD5 was recorded.
403fn local_md5_matches(path: &Path, expected_hex: &str) -> Result<bool, DataError> {
404    if expected_hex.is_empty() {
405        return Ok(true);
406    }
407    let actual = compute_md5(path)?;
408    Ok(actual.eq_ignore_ascii_case(expected_hex))
409}
410
411fn compute_md5(path: &Path) -> Result<String, DataError> {
412    let mut file = File::open(path).map_err(DataError::Io)?;
413    let mut context = md5::Context::new();
414    let mut buffer = [0u8; 65536];
415    loop {
416        let n = file.read(&mut buffer).map_err(DataError::Io)?;
417        if n == 0 {
418            break;
419        }
420        context.consume(&buffer[..n]);
421    }
422    Ok(format!("{:x}", context.compute()))
423}
424
#[cfg(test)]
mod tests {
    use super::*;

    /// Known-answer checks (the empty string plus well-known MD5 vectors) so
    /// that our digest agrees with everyone else's notion of MD5.
    #[test]
    fn compute_md5_matches_rfc1321_vectors() {
        let scratch = tempfile::tempdir().unwrap();
        let vectors: &[(&[u8], &str)] = &[
            (b"", "d41d8cd98f00b204e9800998ecf8427e"),
            (b"abc", "900150983cd24fb0d6963f7d28e17f72"),
            (
                b"The quick brown fox jumps over the lazy dog",
                "9e107d9d372bb6826bd81d3542a419d6",
            ),
        ];
        for (i, &(input, digest)) in vectors.iter().enumerate() {
            let file = scratch.path().join(format!("case_{i}.bin"));
            fs::write(&file, input).unwrap();
            assert_eq!(
                compute_md5(&file).unwrap(),
                digest,
                "case {i}: {:?}",
                std::str::from_utf8(input)
            );
        }
    }

    #[test]
    fn local_md5_matches_detects_correct_and_incorrect_hashes() {
        let scratch = tempfile::tempdir().unwrap();
        let file = scratch.path().join("payload.txt");
        fs::write(&file, b"hello").unwrap();
        let digest = compute_md5(&file).unwrap();
        let shouted = digest.to_uppercase();
        let zeros = "0".repeat(32);

        // Same digest, as computed.
        assert!(local_md5_matches(&file, &digest).unwrap());
        // Same digest, different letter case.
        assert!(local_md5_matches(&file, &shouted).unwrap());
        // A digest that cannot be right.
        assert!(!local_md5_matches(&file, &zeros).unwrap());
    }

    #[test]
    fn local_md5_matches_skips_check_when_sidecar_has_empty_hash() {
        // A legacy sidecar from before MD5 was recorded carries md5 = "".
        // The helper has to accept it (otherwise every start would
        // re-download) and must not even look at the file.
        let scratch = tempfile::tempdir().unwrap();
        let absent = scratch.path().join("not_there.bin");
        assert!(local_md5_matches(&absent, "").unwrap());
    }

    /// With a real MD5 in the sidecar and a missing/unreadable file, the
    /// helper has to hand the I/O error back. Returning `false` would
    /// trigger a re-download and returning `true` would hide the problem;
    /// `ensure_ready` depends on the error to report unverifiable cache
    /// entries to its caller.
    #[test]
    fn local_md5_matches_propagates_io_error_on_missing_file() {
        let scratch = tempfile::tempdir().unwrap();
        let absent = scratch.path().join("absent.bin");
        let outcome = local_md5_matches(&absent, "deadbeef");
        let err = outcome.unwrap_err();
        assert!(
            matches!(err, DataError::Io(_)),
            "expected DataError::Io, got {err:?}"
        );
    }

    /// A missing sidecar is an error as far as `read_meta` is concerned.
    /// (`ensure_ready` handles that case itself through its
    /// `let Ok(meta) = ... else continue` branch instead of failing.)
    #[test]
    fn read_meta_errors_on_missing_file() {
        let scratch = tempfile::tempdir().unwrap();
        let absent = scratch.path().join("kernel.meta.json");
        let outcome = read_meta(&absent);
        assert!(matches!(outcome.unwrap_err(), DataError::Io(_)));
    }

    /// A network failure during the staleness HEAD has to come back as
    /// `DataError::Http`. The host uses the reserved `.invalid` TLD
    /// (RFC 6761), so the request is expected to fail at name resolution
    /// without contacting a real server.
    #[test]
    fn is_stale_propagates_http_error_on_unreachable_host() {
        let url = "http://nx.invalid/never-resolves";
        let sidecar = FileMeta {
            url: url.into(),
            downloaded_at: 0,
            content_length: Some(1),
            last_modified: None,
            md5: String::new(),
        };
        let err = is_stale(url, &sidecar).unwrap_err();
        assert!(
            matches!(err, DataError::Http(_)),
            "expected DataError::Http, got {err:?}"
        );
    }

    /// `DEFAULT_KERNELS` and `AssistDataPaths` have to stay in lockstep: a
    /// catalog entry without a field would be downloaded but never handed
    /// to the caller, and a field without a catalog entry would point at a
    /// file nobody downloads.
    #[test]
    fn every_default_kernel_has_a_path_field() {
        let manager = DataManager::with_dir("/tmp/check");
        let resolved = manager.paths();
        let fields = [
            &resolved.planets,
            &resolved.asteroids,
            &resolved.obscodes,
            &resolved.eop_high_prec,
            &resolved.eop_historical,
            &resolved.eop_predict,
        ];

        // Catalog → fields.
        for kernel in DEFAULT_KERNELS {
            let wanted = manager.data_dir.join(kernel.filename);
            assert!(
                fields.contains(&&wanted),
                "kernel {:?} in DEFAULT_KERNELS has no corresponding field in AssistDataPaths",
                kernel.filename
            );
        }

        // Fields → catalog.
        for field in fields {
            let filename = field.file_name().and_then(|name| name.to_str()).unwrap();
            let declared = DEFAULT_KERNELS
                .iter()
                .map(|kernel| kernel.filename)
                .any(|name| name == filename);
            assert!(
                declared,
                "AssistDataPaths field points at {filename:?}, which is not in DEFAULT_KERNELS"
            );
        }
    }

    #[test]
    fn eop_kernels_returns_spice_idiomatic_load_order() {
        // Expected order is predict, historical, current: with last-in-wins
        // loading the high-precision kernel then takes over wherever it has
        // coverage.
        let resolved = DataManager::with_dir("/tmp/check").paths();
        let [first, second, third] = resolved.eop_kernels();
        assert_eq!(first, &resolved.eop_predict);
        assert_eq!(second, &resolved.eop_historical);
        assert_eq!(third, &resolved.eop_high_prec);
    }

    #[test]
    fn meta_round_trips_through_sidecar() {
        // Every field must survive a write/read cycle through the sidecar
        // file, in particular the ones the staleness and integrity checks
        // read (content_length, last_modified, md5).
        let scratch = tempfile::tempdir().unwrap();
        let sidecar = scratch.path().join("kernel.meta.json");
        let written = FileMeta {
            url: "https://example.com/kernel.bsp".into(),
            downloaded_at: 1_700_000_000,
            content_length: Some(42),
            last_modified: Some("Mon, 21 Oct 2024 12:00:00 GMT".into()),
            md5: "d41d8cd98f00b204e9800998ecf8427e".into(),
        };
        fs::write(&sidecar, serde_json::to_string_pretty(&written).unwrap()).unwrap();

        let restored = read_meta(&sidecar).unwrap();
        assert_eq!(restored.url, written.url);
        assert_eq!(restored.downloaded_at, written.downloaded_at);
        assert_eq!(restored.content_length, written.content_length);
        assert_eq!(restored.last_modified, written.last_modified);
        assert_eq!(restored.md5, written.md5);
    }
}