Skip to main content

rover/extractor/
output.rs

1//! Output paths for table CSVs and downloaded images.
2
3use std::path::{Path, PathBuf};
4
5use sha2::{Digest, Sha256};
6use url::Url;
7
8use crate::extractor::pipeline::ExtractorError;
9
10#[derive(Debug, Clone)]
11pub struct OutputPaths {
12    root: PathBuf,
13}
14
15impl OutputPaths {
16    /// Resolve the output root. Precedence: `ROVER_OUTPUT_DIR` env var,
17    /// then the supplied path (if `Some`), then `dirs::data_local_dir()
18    /// .join("rover").join("output")`. Creates the root if missing.
19    pub fn resolve(configured: Option<&Path>) -> Result<Self, ExtractorError> {
20        let root: PathBuf = if let Ok(env_dir) = std::env::var("ROVER_OUTPUT_DIR") {
21            PathBuf::from(env_dir)
22        } else if let Some(p) = configured {
23            p.to_path_buf()
24        } else {
25            crate::paths::data_dir().join("output")
26        };
27        std::fs::create_dir_all(&root).map_err(|source| ExtractorError::Output {
28            path: root.display().to_string(),
29            source,
30        })?;
31        Ok(Self { root })
32    }
33
34    pub fn root(&self) -> &Path {
35        &self.root
36    }
37
38    pub fn table_path(&self, url: &Url, table_ordinal: usize) -> PathBuf {
39        let host = url.host_str().unwrap_or("unknown");
40        let key = format!("{}#{}", url.as_str(), table_ordinal);
41        self.root
42            .join("tables")
43            .join(host)
44            .join(format!("{}.csv", sha8(&key)))
45    }
46
47    pub fn image_path(&self, url: &Url, ext: &str) -> PathBuf {
48        let host = url.host_str().unwrap_or("unknown");
49        let ext = ext.trim_start_matches('.');
50        let ext = if ext.is_empty() { "bin" } else { ext };
51        self.root
52            .join("images")
53            .join(host)
54            .join(format!("{}.{ext}", sha8(url.as_str())))
55    }
56}
57
58pub fn sha8(input: &str) -> String {
59    let mut h = Sha256::new();
60    h.update(input.as_bytes());
61    let out = h.finalize();
62    out.iter()
63        .take(4)
64        .fold(String::with_capacity(8), |mut s, b| {
65            s.push_str(&format!("{b:02x}"));
66            s
67        })
68}
69
#[cfg(test)]
mod tests {
    use super::*;
    use crate::extractor::OUTPUT_DIR_TEST_MUTEX as TEST_MUTEX;

    /// Environment variable that overrides the output root.
    const ENV_KEY: &str = "ROVER_OUTPUT_DIR";

    /// Sets `ROVER_OUTPUT_DIR` while alive and removes it again on drop.
    ///
    /// Cleanup happens in `Drop`, so it also runs when an assertion panics
    /// and the override cannot leak into tests that run afterwards.
    ///
    /// Callers must hold `TEST_MUTEX` for as long as the guard is alive, and
    /// must declare the guard *after* the mutex guard so it is dropped first.
    struct EnvOverride;

    impl EnvOverride {
        fn set(dir: &Path) -> Self {
            // SAFETY: the caller holds TEST_MUTEX, which serializes the tests
            // that touch this variable.
            unsafe { std::env::set_var(ENV_KEY, dir) };
            Self
        }
    }

    impl Drop for EnvOverride {
        fn drop(&mut self) {
            // SAFETY: the caller still holds TEST_MUTEX; locals are dropped
            // in reverse declaration order, so this runs before the mutex
            // guard is released.
            unsafe { std::env::remove_var(ENV_KEY) };
        }
    }

    fn url() -> Url {
        Url::parse("https://example.com/article").unwrap()
    }

    #[test]
    fn sha8_is_deterministic_and_eight_chars() {
        let a = sha8("https://example.com/x");
        let b = sha8("https://example.com/x");
        assert_eq!(a, b);
        assert_eq!(a.len(), 8);
    }

    #[test]
    fn table_path_includes_ordinal() {
        let _guard = TEST_MUTEX.lock().unwrap_or_else(|e| e.into_inner());
        let tmp = tempfile::tempdir().unwrap();
        let _env = EnvOverride::set(tmp.path());
        let paths = OutputPaths::resolve(None).unwrap();
        let p0 = paths.table_path(&url(), 0);
        let p1 = paths.table_path(&url(), 1);
        assert_ne!(p0, p1);
        assert!(p0.to_string_lossy().ends_with(".csv"));
        assert!(p0.to_string_lossy().contains("example.com"));
    }

    #[test]
    fn image_path_uses_sha8_of_url_and_ext() {
        let _guard = TEST_MUTEX.lock().unwrap_or_else(|e| e.into_inner());
        let tmp = tempfile::tempdir().unwrap();
        let _env = EnvOverride::set(tmp.path());
        let paths = OutputPaths::resolve(None).unwrap();
        let p = paths.image_path(&Url::parse("https://x/img.png").unwrap(), "png");
        assert!(p.to_string_lossy().ends_with(".png"));
        assert!(p.to_string_lossy().contains(&sha8("https://x/img.png")));
    }

    #[test]
    fn resolve_honors_env_then_config_then_default() {
        let _guard = TEST_MUTEX.lock().unwrap_or_else(|e| e.into_inner());

        // The environment variable wins over an explicitly configured path.
        {
            let tmp = tempfile::tempdir().unwrap();
            let _env = EnvOverride::set(tmp.path());
            let p = OutputPaths::resolve(Some(Path::new("/ignored"))).unwrap();
            assert_eq!(p.root, tmp.path());
        }

        // With the variable removed again, the configured path is used.
        // The built-in default branch is not exercised here.
        let tmp2 = tempfile::tempdir().unwrap();
        let p2 = OutputPaths::resolve(Some(tmp2.path())).unwrap();
        assert_eq!(p2.root, tmp2.path());
    }
}