rover/extractor/
output.rs1use std::path::{Path, PathBuf};
4
5use sha2::{Digest, Sha256};
6use url::Url;
7
8use crate::extractor::pipeline::ExtractorError;
9
/// Locations on disk where extracted artifacts are written.
///
/// Every path handed out by this type is built underneath a single root
/// directory.
#[derive(Clone, Debug)]
pub struct OutputPaths {
    /// Directory under which the `tables/` and `images/` paths are built.
    root: PathBuf,
}
14
15impl OutputPaths {
16 pub fn resolve(configured: Option<&Path>) -> Result<Self, ExtractorError> {
20 let root: PathBuf = if let Ok(env_dir) = std::env::var("ROVER_OUTPUT_DIR") {
21 PathBuf::from(env_dir)
22 } else if let Some(p) = configured {
23 p.to_path_buf()
24 } else {
25 crate::paths::data_dir().join("output")
26 };
27 std::fs::create_dir_all(&root).map_err(|source| ExtractorError::Output {
28 path: root.display().to_string(),
29 source,
30 })?;
31 Ok(Self { root })
32 }
33
34 pub fn root(&self) -> &Path {
35 &self.root
36 }
37
38 pub fn table_path(&self, url: &Url, table_ordinal: usize) -> PathBuf {
39 let host = url.host_str().unwrap_or("unknown");
40 let key = format!("{}#{}", url.as_str(), table_ordinal);
41 self.root
42 .join("tables")
43 .join(host)
44 .join(format!("{}.csv", sha8(&key)))
45 }
46
47 pub fn image_path(&self, url: &Url, ext: &str) -> PathBuf {
48 let host = url.host_str().unwrap_or("unknown");
49 let ext = ext.trim_start_matches('.');
50 let ext = if ext.is_empty() { "bin" } else { ext };
51 self.root
52 .join("images")
53 .join(host)
54 .join(format!("{}.{ext}", sha8(url.as_str())))
55 }
56}
57
58pub fn sha8(input: &str) -> String {
59 let mut h = Sha256::new();
60 h.update(input.as_bytes());
61 let out = h.finalize();
62 out.iter()
63 .take(4)
64 .fold(String::with_capacity(8), |mut s, b| {
65 s.push_str(&format!("{b:02x}"));
66 s
67 })
68}
69
#[cfg(test)]
mod tests {
    use super::*;
    use crate::extractor::OUTPUT_DIR_TEST_MUTEX as TEST_MUTEX;

    /// Environment variable consulted first by `OutputPaths::resolve`.
    const ENV_KEY: &str = "ROVER_OUTPUT_DIR";

    /// Sets `ROVER_OUTPUT_DIR` on construction and removes it on drop.
    ///
    /// Removing the variable in `Drop` means a failing assertion cannot leave
    /// it set (and pointing at a deleted temp dir) for later tests. Declare
    /// the guard *after* taking `TEST_MUTEX` so it is dropped before the lock
    /// is released.
    struct EnvOverride;

    impl EnvOverride {
        fn set(dir: &Path) -> Self {
            // SAFETY: callers hold `TEST_MUTEX`, so tests that take that lock
            // never mutate the environment concurrently. Threads that touch
            // the environment without the lock are not covered by this.
            unsafe { std::env::set_var(ENV_KEY, dir) };
            Self
        }
    }

    impl Drop for EnvOverride {
        fn drop(&mut self) {
            // SAFETY: same argument as in `set`; the guard is dropped while
            // the test still holds `TEST_MUTEX`.
            unsafe { std::env::remove_var(ENV_KEY) };
        }
    }

    fn url() -> Url {
        Url::parse("https://example.com/article").unwrap()
    }

    #[test]
    fn sha8_is_deterministic_and_eight_chars() {
        let a = sha8("https://example.com/x");
        let b = sha8("https://example.com/x");
        assert_eq!(a, b);
        assert_eq!(a.len(), 8);
    }

    #[test]
    fn table_path_includes_ordinal() {
        // A poisoned lock only means another test panicked; keep going.
        let _guard = TEST_MUTEX.lock().unwrap_or_else(|e| e.into_inner());
        let tmp = tempfile::tempdir().unwrap();
        let _env = EnvOverride::set(tmp.path());

        let paths = OutputPaths::resolve(None).unwrap();
        let p0 = paths.table_path(&url(), 0);
        let p1 = paths.table_path(&url(), 1);
        assert_ne!(p0, p1);
        assert!(p0.to_string_lossy().ends_with(".csv"));
        assert!(p0.to_string_lossy().contains("example.com"));
    }

    #[test]
    fn image_path_uses_sha8_of_url_and_ext() {
        let _guard = TEST_MUTEX.lock().unwrap_or_else(|e| e.into_inner());
        let tmp = tempfile::tempdir().unwrap();
        let _env = EnvOverride::set(tmp.path());

        let paths = OutputPaths::resolve(None).unwrap();
        let p = paths.image_path(&Url::parse("https://x/img.png").unwrap(), "png");
        assert!(p.to_string_lossy().ends_with(".png"));
        assert!(p.to_string_lossy().contains(&sha8("https://x/img.png")));
    }

    #[test]
    fn resolve_honors_env_then_config_then_default() {
        let _guard = TEST_MUTEX.lock().unwrap_or_else(|e| e.into_inner());

        // The environment variable wins over an explicitly configured path.
        let tmp = tempfile::tempdir().unwrap();
        {
            let _env = EnvOverride::set(tmp.path());
            let p = OutputPaths::resolve(Some(Path::new("/ignored"))).unwrap();
            assert_eq!(p.root, tmp.path());
        }

        // With the variable removed again, the configured path is used.
        // The built-in default is not exercised here.
        let tmp2 = tempfile::tempdir().unwrap();
        let p2 = OutputPaths::resolve(Some(tmp2.path())).unwrap();
        assert_eq!(p2.root, tmp2.path());
    }
}