1use std::path::Path;
2
3use anyhow::{Context, Result};
4use async_trait::async_trait;
5use serde::Serialize;
6use tokio::fs;
7
8use argus_common::{CrawlJob, FetchResult};
9
10use crate::storage_trait::{url_to_fragment, Storage};
11
12#[derive(Serialize)]
13struct PageMeta {
14 url: String,
15 final_url: String,
16 status: u16,
17 content_type: Option<String>,
18 depth: u16,
19 body_path: String,
20 fetched_at_ms: u64,
21}
22
/// Storage backend that keeps fetched pages as plain files on disk.
///
/// Everything is written beneath `base_path`: metadata goes into a `page`
/// subdirectory and response bodies into a `body` subdirectory. Cloning
/// copies the base path.
#[derive(Clone, Debug)]
pub struct FileStorage {
    /// Root directory under which the `page` and `body` subdirectories live.
    base_path: std::path::PathBuf,
}
29
30impl FileStorage {
31 pub fn new<P: AsRef<Path>>(base_path: P) -> Self {
32 Self {
33 base_path: base_path.as_ref().to_path_buf(),
34 }
35 }
36
37 pub async fn ensure_dirs(&self) -> Result<()> {
38 let page_dir = self.base_path.join("page");
39 let body_dir = self.base_path.join("body");
40 fs::create_dir_all(&page_dir)
41 .await
42 .context("create page dir")?;
43 fs::create_dir_all(&body_dir)
44 .await
45 .context("create body dir")?;
46 Ok(())
47 }
48}
49
50#[async_trait]
51impl Storage for FileStorage {
52 async fn record_fetch(&self, job: &CrawlJob, result: &FetchResult) -> Result<()> {
53 self.ensure_dirs().await?;
54
55 let fragment = url_to_fragment(&job.normalized_url);
56 let body_path = format!("body/{}.bin", fragment);
57 let body_full = self.base_path.join(&body_path);
58 let meta_path = self
59 .base_path
60 .join("page")
61 .join(format!("{}.json", fragment));
62
63 fs::write(&body_full, &result.body)
64 .await
65 .context("write body file")?;
66
67 let meta = PageMeta {
68 url: job.url.clone(),
69 final_url: result.final_url.clone(),
70 status: result.status,
71 content_type: result.content_type.clone(),
72 depth: job.depth,
73 body_path,
74 fetched_at_ms: std::time::SystemTime::now()
75 .duration_since(std::time::UNIX_EPOCH)
76 .unwrap_or_default()
77 .as_millis() as u64,
78 };
79
80 let json = serde_json::to_string_pretty(&meta).context("serialize meta")?;
81 fs::write(&meta_path, json.as_bytes())
82 .await
83 .context("write meta file")?;
84
85 Ok(())
86 }
87}
88
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use argus_common::{CrawlJob, FetchResult};

    use super::*;

    /// Returns a scratch directory path unique to this process and call.
    ///
    /// The path combines the process id with a monotonically increasing
    /// counter; the directory itself is not created here.
    fn temp_dir() -> PathBuf {
        use std::sync::atomic::{AtomicU64, Ordering};
        static COUNTER: AtomicU64 = AtomicU64::new(0);

        let seq = COUNTER.fetch_add(1, Ordering::SeqCst);
        let leaf = format!("argus-storage-test-{}-{}", std::process::id(), seq);
        let mut dir = std::env::temp_dir();
        dir.push(leaf);
        dir
    }

    /// Round-trips one fetch through `record_fetch` and checks both files.
    #[tokio::test]
    async fn record_fetch_creates_page_and_body_files() {
        const PAGE_URL: &str = "https://example.com/";
        const PAGE_BODY: &[u8] = b"<html>body</html>";

        // Start from a clean slate in case a previous run left files behind.
        let base = temp_dir();
        let _ = std::fs::remove_dir_all(&base);
        let storage = FileStorage::new(&base);
        storage.ensure_dirs().await.unwrap();

        let job = CrawlJob {
            url: PAGE_URL.to_string(),
            normalized_url: PAGE_URL.to_string(),
            host: "example.com".to_string(),
            depth: 0,
        };
        let result = FetchResult {
            url: job.url.clone(),
            final_url: PAGE_URL.to_string(),
            status: 200,
            content_type: Some("text/html".to_string()),
            body: bytes::Bytes::from_static(PAGE_BODY),
        };

        storage.record_fetch(&job, &result).await.unwrap();

        let fragment = url_to_fragment(&job.normalized_url);
        let meta_path = base.join("page").join(format!("{}.json", fragment));
        let body_path = base.join("body").join(format!("{}.bin", fragment));

        assert!(meta_path.exists(), "metadata file should exist");
        assert!(body_path.exists(), "body file should exist");

        // The metadata must parse as JSON and echo back the job/result values.
        let meta_json = std::fs::read_to_string(&meta_path).unwrap();
        let meta: serde_json::Value = serde_json::from_str(&meta_json).expect("valid JSON");
        assert_eq!(meta["url"].as_str(), Some(PAGE_URL));
        assert_eq!(meta["final_url"].as_str(), Some(PAGE_URL));
        assert_eq!(meta["status"].as_u64(), Some(200));

        // The body file must hold exactly the bytes handed to `record_fetch`.
        let stored = std::fs::read(&body_path).unwrap();
        assert_eq!(stored, PAGE_BODY);

        let _ = std::fs::remove_dir_all(&base);
    }
}