uiuifree_crawl_cache/
lib.rs

pub(crate) mod error;

use reqwest::Client;
use std::fs;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use std::time::Duration;

pub use error::CrawlCacheError;

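// The `error` module itself is not shown in this listing. Judging only from
// how `CrawlCacheError` is constructed below, a minimal sketch of it (an
// assumption, not the crate's actual definition) would be roughly:
//
//     #[derive(Debug)]
//     pub enum CrawlCacheError {
//         Client(String),
//     }
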
/// Fetches pages over HTTP and caches the response bodies as local files.
pub struct CrawlCache {
    /// User-Agent header sent with every request.
    user_agent: String,
    /// Optional delay slept after each network fetch (simple rate limiting).
    duration: Option<Duration>,
    /// Optional per-request timeout.
    timeout: Option<Duration>,
}

impl CrawlCache {
    /// Creates a cache client with a default desktop Chrome User-Agent and no
    /// delay or timeout configured.
    pub fn new() -> Self {
        CrawlCache {
            user_agent: "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36".to_string(),
            duration: None,
            timeout: None,
        }
    }

    /// Overrides the User-Agent header.
    pub fn set_user_agent(mut self, user_agent: String) -> Self {
        self.user_agent = user_agent;
        self
    }

    /// Sets a delay to sleep after each network fetch (cache hits skip it).
    pub fn set_duration(mut self, duration: Duration) -> Self {
        self.duration = Some(duration);
        self
    }

    /// Sets the per-request timeout.
    pub fn set_timeout(mut self, timeout: Duration) -> Self {
        self.timeout = Some(timeout);
        self
    }

    /// Fetches `url` and returns the response body as text.
    pub async fn get_content(&self, url: &str) -> Result<String, CrawlCacheError> {
        let client = self.client()?;
        let response = client
            .get(url)
            .send()
            .await
            .map_err(|e| CrawlCacheError::Client(e.to_string()))?;
        response
            .text()
            .await
            .map_err(|e| CrawlCacheError::Client(e.to_string()))
    }

    /// Returns the contents of `cache_path` if it already exists; otherwise
    /// fetches `url`, writes the body to `cache_path`, sleeps for the
    /// configured delay (if any), and returns the body.
    pub async fn get_content_or_cache(
        &self,
        url: &str,
        cache_path: &str,
    ) -> Result<String, CrawlCacheError> {
        if let Some(cache) = Self::get_cache(cache_path) {
            return Ok(cache);
        }

        // Make sure the cache directory exists before writing the file.
        if let Some(dir_path) = Path::new(cache_path).parent() {
            if !dir_path.is_dir() {
                fs::create_dir_all(dir_path).unwrap();
            }
        }

        let content = self.get_content(url).await?;
        let mut file = File::create(cache_path).unwrap();
        file.write_all(content.as_bytes()).unwrap();
        let _ = file.flush();

        // Optional politeness delay between fetches.
        if let Some(duration) = self.duration {
            tokio::time::sleep(duration).await;
        }
        Ok(content)
    }

    /// Reads a previously cached file, if one exists and is readable.
    pub fn get_cache(cache_path: &str) -> Option<String> {
        let path = Path::new(cache_path);
        if path.is_file() {
            return fs::read_to_string(cache_path).ok();
        }
        None
    }

    /// Deletes the cache file. Returns `true` if the file was removed or did
    /// not exist in the first place.
    pub fn remove_cache(cache_path: &str) -> bool {
        let path = Path::new(cache_path);
        if path.is_file() {
            return fs::remove_file(cache_path).is_ok();
        }
        true
    }

    fn client(&self) -> Result<Client, CrawlCacheError> {
        let mut builder = reqwest::ClientBuilder::new().user_agent(&self.user_agent);
        if let Some(timeout) = self.timeout {
            builder = builder.timeout(timeout);
        }

        builder
            .build()
            .map_err(|e| CrawlCacheError::Client(e.to_string()))
    }
}

#[tokio::test]
async fn test() {
    let cache = CrawlCache::new();
    let a = cache
        .get_content_or_cache("https://www.yahoo.co.jp/", "./yahoo.co.jp/index.html")
        .await;
    assert!(a.is_ok());
}
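
// A hedged usage sketch (not part of the original file): exercising the
// builder setters and the cache helpers together. The URL, cache path, and
// test name are illustrative assumptions.
#[tokio::test]
async fn test_configured_fetch() {
    let cache = CrawlCache::new()
        .set_user_agent("uiuifree-crawl-cache-example/0.1".to_string())
        .set_timeout(Duration::from_secs(10))
        .set_duration(Duration::from_millis(200));

    let path = "./cache/yahoo.co.jp/index.html";
    let first = cache
        .get_content_or_cache("https://www.yahoo.co.jp/", path)
        .await;
    assert!(first.is_ok());

    // A later call would be served from the file written above.
    assert!(CrawlCache::get_cache(path).is_some());
    assert!(CrawlCache::remove_cache(path));
}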