uiuifree_crawl_cache/
lib.rs1pub(crate) mod error;
2
3use reqwest::Client;
4use std::fs;
5use std::fs::File;
6use std::io::Write;
7use std::path::Path;
8use std::time::Duration;
9
10pub use error::CrawlCacheError;
11
/// HTTP fetcher that can store response bodies on disk and reuse them later.
///
/// Values are configured builder-style through the `set_*` methods, each of
/// which consumes and returns the instance.
#[derive(Debug, Clone)]
pub struct CrawlCache {
    /// Value sent as the `User-Agent` header of every request.
    user_agent: String,
    /// Pause applied after a page has been downloaded and cached; `None` means no pause.
    duration: Option<Duration>,
    /// Request timeout handed to the HTTP client; `None` leaves the client default.
    timeout: Option<Duration>,
}
17
18impl CrawlCache {
19 pub fn new() -> Self {
20 CrawlCache {
21 user_agent: "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36".to_string(),
22 duration: None,
23 timeout: None,
24 }
25 }
26 pub fn set_user_agent(mut self, user_agent: String) -> Self {
27 self.user_agent = user_agent;
28 self
29 }
30 pub fn set_duration(mut self, duration: Duration) -> Self {
31 self.duration = Some(duration);
32 self
33 }
34 pub fn set_timeout(mut self, timeout: Duration) -> Self {
35 self.timeout = Some(timeout);
36 self
37 }
38 pub async fn get_content(&self, url: &str) -> Result<String, CrawlCacheError> {
39 let client = self.client()?;
40 let content = match client.get(url).send().await {
41 Ok(v) => v.text().await,
42 Err(e) => {
43 return Err(CrawlCacheError::Client(e.to_string()));
44 }
45 };
46 match content {
47 Ok(v) => Ok(v),
48 Err(e) => Err(CrawlCacheError::Client(e.to_string())),
49 }
50 }
51 pub async fn get_content_or_cache(
52 &self,
53 url: &str,
54 cache_path: &str,
55 ) -> Result<String, CrawlCacheError> {
56 let path = Path::new(cache_path);
57 if let Some(cache) = Self::get_cache(cache_path) {
58 return Ok(cache);
59 }
60
61 let dir_path = path.parent().unwrap().display().to_string();
62 if !Path::new(dir_path.as_str()).is_dir() {
63 fs::create_dir_all(dir_path.as_str()).unwrap();
64 }
65
66 let content = self.get_content(url).await?;
67 let mut file = File::create(cache_path).unwrap();
68 file.write(content.as_bytes()).unwrap();
69 match file.flush() {
70 Ok(_) => {}
71 Err(_) => {}
72 };
73 if let Some(ref duration) = self.duration {
74 tokio::time::sleep(duration.clone()).await;
75 }
76 Ok(content)
77 }
78
79 pub fn get_cache(cache_path: &str) -> Option<String> {
80 let path = Path::new(cache_path);
81 if path.is_file() {
82 return match std::fs::read_to_string(cache_path) {
83 Ok(v) => Some(v),
84 Err(_) => None,
85 };
86 }
87 None
88 }
89 pub fn remove_cache(cache_path: &str) -> bool {
90 let path = Path::new(cache_path);
91 if path.is_file() {
92 return std::fs::remove_file(cache_path).is_ok();
93 }
94 true
95 }
96
97 fn client(&self) -> Result<Client, CrawlCacheError> {
98 let mut res = reqwest::ClientBuilder::new().user_agent(&self.user_agent);
99 if let Some(timeout) = &self.timeout {
100 res = res.timeout(timeout.clone())
101 }
102
103 match res.build() {
104 Ok(v) => Ok(v),
105 Err(e) => Err(CrawlCacheError::Client(e.to_string())),
106 }
107 }
108}
109
110#[tokio::test]
111async fn test() {
112 let cache = CrawlCache::new();
113 let a = cache
114 .get_content_or_storage("https://www.yahoo.co.jp/", "./yahoo.co.jp/index.html")
115 .await;
116 assert!(a.is_ok())
117}