//! File-based storage backend (`argus_storage/file.rs`): persists crawl
//! fetches as JSON metadata plus raw body files under a base directory.

use std::path::Path;

use anyhow::{Context, Result};
use async_trait::async_trait;
use serde::Serialize;
use tokio::fs;

use argus_common::{CrawlJob, FetchResult};

use crate::storage_trait::{url_to_fragment, Storage};
11
12#[derive(Serialize)]
13struct PageMeta {
14    url: String,
15    final_url: String,
16    status: u16,
17    content_type: Option<String>,
18    depth: u16,
19    body_path: String,
20    fetched_at_ms: u64,
21}
22
/// File-system storage backend.
///
/// Writes each fetch to a directory pair under the base path:
/// `base_dir/page/<fragment>.json` (metadata) and
/// `base_dir/body/<fragment>.bin` (raw body), where `<fragment>` is derived
/// from the job's normalized URL.
#[derive(Clone)]
pub struct FileStorage {
    // Root directory under which `page/` and `body/` are created.
    base_path: std::path::PathBuf,
}
29
30impl FileStorage {
31    pub fn new<P: AsRef<Path>>(base_path: P) -> Self {
32        Self {
33            base_path: base_path.as_ref().to_path_buf(),
34        }
35    }
36
37    pub async fn ensure_dirs(&self) -> Result<()> {
38        let page_dir = self.base_path.join("page");
39        let body_dir = self.base_path.join("body");
40        fs::create_dir_all(&page_dir)
41            .await
42            .context("create page dir")?;
43        fs::create_dir_all(&body_dir)
44            .await
45            .context("create body dir")?;
46        Ok(())
47    }
48}
49
50#[async_trait]
51impl Storage for FileStorage {
52    async fn record_fetch(&self, job: &CrawlJob, result: &FetchResult) -> Result<()> {
53        self.ensure_dirs().await?;
54
55        let fragment = url_to_fragment(&job.normalized_url);
56        let body_path = format!("body/{}.bin", fragment);
57        let body_full = self.base_path.join(&body_path);
58        let meta_path = self
59            .base_path
60            .join("page")
61            .join(format!("{}.json", fragment));
62
63        fs::write(&body_full, &result.body)
64            .await
65            .context("write body file")?;
66
67        let meta = PageMeta {
68            url: job.url.clone(),
69            final_url: result.final_url.clone(),
70            status: result.status,
71            content_type: result.content_type.clone(),
72            depth: job.depth,
73            body_path,
74            fetched_at_ms: std::time::SystemTime::now()
75                .duration_since(std::time::UNIX_EPOCH)
76                .unwrap_or_default()
77                .as_millis() as u64,
78        };
79
80        let json = serde_json::to_string_pretty(&meta).context("serialize meta")?;
81        fs::write(&meta_path, json.as_bytes())
82            .await
83            .context("write meta file")?;
84
85        Ok(())
86    }
87}
88
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use argus_common::{CrawlJob, FetchResult};

    use super::*;

    /// Returns a unique temp-directory path (not created yet). The process id
    /// plus an atomic counter keeps parallel test runs from colliding.
    fn temp_dir() -> PathBuf {
        use std::sync::atomic::{AtomicU64, Ordering};
        static COUNTER: AtomicU64 = AtomicU64::new(0);
        let n = COUNTER.fetch_add(1, Ordering::SeqCst);
        std::env::temp_dir().join(format!("argus-storage-test-{}-{}", std::process::id(), n))
    }

    #[tokio::test]
    async fn record_fetch_creates_page_and_body_files() {
        let base = temp_dir();
        // Start from a clean slate in case an earlier run left files behind.
        let _ = std::fs::remove_dir_all(&base);
        let storage = FileStorage::new(&base);
        storage.ensure_dirs().await.unwrap();

        let job = CrawlJob {
            url: "https://example.com/".to_string(),
            normalized_url: "https://example.com/".to_string(),
            host: "example.com".to_string(),
            depth: 0,
        };
        let result = FetchResult {
            url: job.url.clone(),
            final_url: "https://example.com/".to_string(),
            status: 200,
            content_type: Some("text/html".to_string()),
            body: bytes::Bytes::from_static(b"<html>body</html>"),
        };

        storage.record_fetch(&job, &result).await.unwrap();

        // Reconstruct the expected paths from the same fragment function the
        // implementation uses.
        let fragment = crate::storage_trait::url_to_fragment(&job.normalized_url);
        let meta_path = base.join("page").join(format!("{}.json", fragment));
        let body_path = base.join("body").join(format!("{}.bin", fragment));

        assert!(meta_path.exists(), "metadata file should exist");
        assert!(body_path.exists(), "body file should exist");

        // Metadata must be valid JSON carrying the fetch's key fields.
        let meta_json = std::fs::read_to_string(&meta_path).unwrap();
        let meta: serde_json::Value = serde_json::from_str(&meta_json).expect("valid JSON");
        assert_eq!(
            meta.get("url").and_then(|v| v.as_str()),
            Some("https://example.com/")
        );
        assert_eq!(
            meta.get("final_url").and_then(|v| v.as_str()),
            Some("https://example.com/")
        );
        assert_eq!(meta.get("status").and_then(|v| v.as_u64()), Some(200));

        // Body file holds the raw bytes, untouched.
        let body = std::fs::read(&body_path).unwrap();
        assert_eq!(body, b"<html>body</html>");

        // Best-effort cleanup; failure here should not fail the test.
        let _ = std::fs::remove_dir_all(&base);
    }
}