Skip to main content

ppt_rs/web2ppt/
fetcher.rs

1//! Web page fetcher for Web2PPT
2
3use super::{Web2PptError, Result, Web2PptConfig};
4use reqwest::blocking::Client;
5use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, ACCEPT_LANGUAGE, CACHE_CONTROL};
6use std::time::Duration;
7
/// Blocking HTTP fetcher that downloads web pages for Web2PPT.
///
/// Bundles a [`reqwest::blocking::Client`] with the [`Web2PptConfig`] it was
/// built from; see [`WebFetcher::with_config`] for how the client is set up.
pub struct WebFetcher {
    /// HTTP client reused for every request issued by this fetcher.
    client: Client,
    /// Configuration supplied at construction time; exposed via [`WebFetcher::config`].
    config: Web2PptConfig,
}
13
14impl WebFetcher {
15    /// Create a new web fetcher with default config
16    pub fn new() -> Result<Self> {
17        Self::with_config(Web2PptConfig::default())
18    }
19
20    /// Create a new web fetcher with custom config
21    pub fn with_config(config: Web2PptConfig) -> Result<Self> {
22        // Build headers to look like a real browser
23        let mut headers = HeaderMap::new();
24        headers.insert(ACCEPT, HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"));
25        headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("en-US,en;q=0.5"));
26        headers.insert(CACHE_CONTROL, HeaderValue::from_static("no-cache"));
27        
28        let client = Client::builder()
29            .user_agent(&config.user_agent)
30            .timeout(Duration::from_secs(config.timeout_secs))
31            .default_headers(headers)
32            .redirect(reqwest::redirect::Policy::limited(10))
33            .build()
34            .map_err(|e| Web2PptError::FetchError(e.to_string()))?;
35
36        Ok(WebFetcher { client, config })
37    }
38
39    /// Fetch HTML content from a URL
40    pub fn fetch(&self, url: &str) -> Result<String> {
41        // Validate URL
42        let parsed_url = url::Url::parse(url)
43            .map_err(|e| Web2PptError::InvalidUrl(e.to_string()))?;
44
45        // Only allow http/https
46        if parsed_url.scheme() != "http" && parsed_url.scheme() != "https" {
47            return Err(Web2PptError::InvalidUrl(
48                "Only HTTP and HTTPS URLs are supported".to_string()
49            ));
50        }
51
52        // Fetch the page with referer header
53        let response = self.client
54            .get(url)
55            .header("Referer", url)
56            .send()
57            .map_err(|e| Web2PptError::FetchError(e.to_string()))?;
58
59        // Check status
60        if !response.status().is_success() {
61            return Err(Web2PptError::FetchError(
62                format!("HTTP {}: {}", response.status().as_u16(), response.status().as_str())
63            ));
64        }
65
66        // Get text content
67        response.text()
68            .map_err(|e| Web2PptError::FetchError(e.to_string()))
69    }
70
71    /// Fetch and return both URL and HTML
72    pub fn fetch_with_url(&self, url: &str) -> Result<(String, String)> {
73        let html = self.fetch(url)?;
74        Ok((url.to_string(), html))
75    }
76
77    /// Get the config
78    pub fn config(&self) -> &Web2PptConfig {
79        &self.config
80    }
81}
82
83impl Default for WebFetcher {
84    fn default() -> Self {
85        Self::new().expect("Failed to create default WebFetcher")
86    }
87}
88
#[cfg(test)]
mod tests {
    use super::*;

    /// A string that is not a URL at all must be rejected.
    #[test]
    fn test_invalid_url() {
        let outcome = WebFetcher::new().unwrap().fetch("not-a-url");
        assert!(outcome.is_err());
    }

    /// A well-formed URL with an unsupported scheme must be rejected.
    #[test]
    fn test_invalid_scheme() {
        let outcome = WebFetcher::new().unwrap().fetch("ftp://example.com");
        assert!(outcome.is_err());
    }

    /// The configuration passed to `with_config` is the one returned by `config()`.
    #[test]
    fn test_config() {
        let custom = Web2PptConfig::new().timeout(60);
        let fetcher = WebFetcher::with_config(custom).unwrap();
        let stored = fetcher.config();
        assert_eq!(stored.timeout_secs, 60);
    }
}