url_preview/
fetcher.rs

use super::is_twitter_url;
use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
use crate::PreviewError;
use reqwest::{header::HeaderMap, Client};
use scraper::{Html, Selector};
use serde::Deserialize;
use std::time::Duration;
use tracing::{debug, error, instrument, warn};

#[derive(Debug, Clone, Deserialize)]
pub struct OEmbedResponse {
    pub html: String,
    #[serde(default)]
    pub author_name: String,
    #[serde(default)]
    pub author_url: String,
    pub provider_name: String,
    pub provider_url: String,
}

#[derive(Clone)]
pub struct Fetcher {
    client: Client,
}

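/// The result of fetching a URL: raw HTML for ordinary pages, or an oEmbed
/// payload for Twitter URLs.
///
/// A minimal sketch of how a caller might branch on the variant (the URL is
/// illustrative):
///
/// ```ignore
/// match fetcher.fetch("https://example.com/article").await? {
///     FetchResult::Html(html) => println!("got {} bytes of HTML", html.len()),
///     FetchResult::OEmbed(oembed) => println!("tweet by {}", oembed.author_name),
/// }
/// ```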
#[derive(Debug, Clone)]
pub enum FetchResult {
    Html(String),
    OEmbed(OEmbedResponse),
}

impl Default for Fetcher {
    fn default() -> Self {
        Self::new()
    }
}

impl Fetcher {
    pub fn new() -> Self {
        let user_agent = "url_preview/0.1.0";
        let timeout = Duration::from_secs(10);
        debug!("Fetcher initialized with default configuration");

        Self::new_with_custom_config(timeout, user_agent)
    }

    pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
        let client = Client::builder()
            .timeout(timeout)
            .user_agent(user_agent)
            .pool_max_idle_per_host(10)
            .build()
            .unwrap_or_else(|e| {
                error!(error = %e, "Failed to create HTTP client");
                panic!("Failed to initialize HTTP client: {}", e);
            });
        Fetcher { client }
    }

    pub fn with_client(client: Client) -> Self {
        Self { client }
    }

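    /// Fetches several URLs concurrently, preserving input order and returning
    /// the first error encountered, if any.
    ///
    /// A minimal usage sketch (the URLs are illustrative):
    ///
    /// ```ignore
    /// let fetcher = Fetcher::new();
    /// let results = fetcher
    ///     .fetch_batch(vec!["https://example.com", "https://example.org"])
    ///     .await?;
    /// assert_eq!(results.len(), 2);
    /// ```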
    pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
        let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
        let results = futures::future::join_all(futures).await;

        // Keep results in input order; propagate the first error, if any.
        results.into_iter().collect()
    }

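    /// Fetches a URL with simple exponential backoff: up to three attempts,
    /// sleeping 1s and then 2s between retries.
    ///
    /// A minimal usage sketch (the URL is illustrative):
    ///
    /// ```ignore
    /// let html = fetcher.fetch_with_backoff("https://example.com/flaky").await?;
    /// ```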
    #[instrument(level = "debug", skip(self), err)]
    pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
        let max_retries = 3;
        let mut delay = Duration::from_millis(1000);

        for attempt in 0..max_retries {
            debug!(attempt = attempt + 1, "Attempting to fetch URL");

            match self.client.get(url).send().await {
                Ok(response) => {
                    if response.status().is_success() {
                        debug!(url = %url, "Successfully fetched URL");
                        return response.text().await.map_err(|e| {
                            error!(error = %e, "Failed to read response body");
                            PreviewError::FetchError(e.to_string())
                        });
                    }

                    if attempt < max_retries - 1 {
                        warn!(
                            status = %response.status(),
                            attempt = attempt + 1,
                            "Request failed, retrying after delay"
                        );
                        tokio::time::sleep(delay).await;
                        delay *= 2;
                        continue;
                    }
                }
                Err(e) => {
                    if attempt < max_retries - 1 {
                        warn!(
                            error = %e,
                            attempt = attempt + 1,
                            "Request error, retrying after delay"
                        );
                        tokio::time::sleep(delay).await;
                        delay *= 2;
                        continue;
                    }
                    error!(error = %e, "Max retries exceeded");
                    return Err(PreviewError::FetchError(e.to_string()));
                }
            }
        }

        error!("Failed to fetch URL after maximum retries");
        Err(PreviewError::FetchError("Max retries exceeded".to_string()))
    }

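    /// Fetches a single URL, routing Twitter URLs through the oEmbed API and
    /// everything else through a plain HTTP GET.
    ///
    /// A minimal usage sketch (the URL is illustrative):
    ///
    /// ```ignore
    /// match fetcher.fetch("https://twitter.com/user/status/1").await? {
    ///     FetchResult::OEmbed(oembed) => { /* embeddable markup in oembed.html */ }
    ///     FetchResult::Html(html) => { /* raw page HTML */ }
    /// }
    /// ```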
    #[instrument(level = "debug", skip(self), err)]
    pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
        debug!(url = %url, "Starting fetch request");

        if is_twitter_url(url) {
            debug!(url = %url, "Detected Twitter URL, using oEmbed API");
            let oembed = self.fetch_twitter_oembed(url).await?;
            Ok(FetchResult::OEmbed(oembed))
        } else {
            debug!(url = %url, "Fetching regular webpage");
            let content = self
                .client
                .get(url)
                .send()
                .await
                .map_err(|e| {
                    error!(error = %e, url = %url, "Failed to send request");
                    PreviewError::FetchError(e.to_string())
                })?
                .text()
                .await
                .map_err(|e| {
                    error!(error = %e, url = %url, "Failed to read response body");
                    PreviewError::FetchError(e.to_string())
                })?;

            debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
            Ok(FetchResult::Html(content))
        }
    }

    #[instrument(level = "debug", skip(self), err)]
    async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
        debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");

        // Pass the tweet URL as a query parameter so reqwest percent-encodes it.
        let response = self
            .client
            .get("https://publish.twitter.com/oembed")
            .query(&[("url", tweet_url), ("omit_script", "1"), ("lang", "en")])
            .send()
            .await
            .map_err(|e| {
                error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
                PreviewError::ExternalServiceError {
                    service: "Twitter".to_string(),
                    message: e.to_string(),
                }
            })?;

        let oembed: OEmbedResponse = response.json().await.map_err(|e| {
            error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
            PreviewError::ExternalServiceError {
                service: "Twitter".to_string(),
                message: e.to_string(),
            }
        })?;

        debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
        Ok(oembed)
    }
}

// for Twitter
impl Fetcher {
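    /// Creates a fetcher that sends browser-like headers and a desktop user
    /// agent, intended for requests against Twitter.
    ///
    /// A minimal usage sketch (the URL is illustrative):
    ///
    /// ```ignore
    /// let twitter_fetcher = Fetcher::new_twitter_client();
    /// let html = twitter_fetcher
    ///     .fetch_with_backoff("https://twitter.com/rustlang")
    ///     .await?;
    /// ```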
    #[instrument(level = "debug")]
    pub fn new_twitter_client() -> Self {
        debug!("Creating Twitter-specific fetcher");

        let mut headers = HeaderMap::new();

        headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
        headers.insert(
            "Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
                .parse()
                .unwrap(),
        );

        headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
        headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
        headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
        headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
        headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());

        headers.insert("Cache-Control", "no-cache".parse().unwrap());
        headers.insert("Pragma", "no-cache".parse().unwrap());

        let client = Client::builder()
            .user_agent(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/119.0.0.0 Safari/537.36",
            )
            .timeout(Duration::from_secs(30))
            .redirect(reqwest::redirect::Policy::limited(10))
            .default_headers(headers)
            .build()
            .expect("Failed to create Twitter HTTP client");

        debug!("Twitter-specific fetcher created successfully");
        Self { client }
    }

    /// Creates a `Fetcher` from a user-supplied [`FetcherConfig`].
    ///
    /// This lets callers provide their own user agent, timeout, headers,
    /// and redirect policy.
    pub fn new_with_config(config: FetcherConfig) -> Self {
        let mut client_builder = Client::builder()
            .user_agent(config.user_agent)
            .timeout(config.timeout);

        // Apply custom headers
        if let Some(headers) = config.headers {
            client_builder = client_builder.default_headers(headers);
        }

        // Apply redirect policy
        if let Some(redirect_policy) = config.redirect_policy {
            client_builder = client_builder.redirect(redirect_policy);
        }

        let client = client_builder
            .build()
            .expect("Failed to create HTTP client with custom config");

        Self { client }
    }
}

// for GitHub
impl Fetcher {
    pub fn new_github_client() -> Self {
        debug!("Creating GitHub-specific client");

        let mut headers = HeaderMap::new();
        headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());

        if let Ok(token) = std::env::var("GITHUB_TOKEN") {
            debug!("Found GitHub token in environment");
            headers.insert(
                "Authorization",
                format!("Bearer {}", token).parse().unwrap(),
            );
        }

        let client = Client::builder()
            .user_agent("url_preview/1.0")
            .default_headers(headers)
            .timeout(Duration::from_secs(10))
            .build()
            .expect("Failed to create GitHub HTTP client");

        Self { client }
    }

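    /// Fetches repository metadata from the GitHub REST API.
    ///
    /// A minimal usage sketch (the owner/repo pair is illustrative):
    ///
    /// ```ignore
    /// let fetcher = Fetcher::new_github_client();
    /// let repo: GitHubRepository = fetcher.fetch_github_repo("rust-lang", "rust").await?;
    /// ```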
    pub async fn fetch_github_repo(
        &self,
        owner: &str,
        repo: &str,
    ) -> Result<GitHubRepository, PreviewError> {
        let url = format!("https://api.github.com/repos/{}/{}", owner, repo);
        debug!(url = %url, "Fetching GitHub repository information");

        let response =
            self.client.get(&url).send().await.map_err(|e| {
                PreviewError::FetchError(format!("GitHub API request failed: {}", e))
            })?;

        if !response.status().is_success() {
            return Err(PreviewError::FetchError(format!(
                "GitHub API returned status: {}",
                response.status()
            )));
        }

        response.json::<GitHubRepository>().await.map_err(|e| {
            PreviewError::ExtractError(format!("Failed to parse GitHub response: {}", e))
        })
    }
}

/// Configuration options for constructing a [`Fetcher`].
///
/// # Examples
/// ```ignore
/// let fetcher = Fetcher::new();
///
/// // Using Twitter-specific configuration
/// let twitter_fetcher = Fetcher::new_twitter_client();
///
/// // Using custom configuration
/// let custom_fetcher = Fetcher::new_with_config(FetcherConfig {
///     user_agent: "my-custom-agent/1.0".to_string(),
///     timeout: Duration::from_secs(20),
///     headers: Some(my_custom_headers),
///     redirect_policy: Some(my_redirect_policy),
/// });
/// ```
pub struct FetcherConfig {
    pub user_agent: String,
    pub timeout: Duration,
    pub headers: Option<HeaderMap>,
    pub redirect_policy: Option<reqwest::redirect::Policy>,
}

impl Default for FetcherConfig {
    fn default() -> Self {
        Self {
            user_agent: "url_preview/0.1.0".to_string(),
            timeout: Duration::from_secs(10),
            headers: None,
            redirect_policy: None,
        }
    }
}
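
// A minimal sketch of overriding a single field while keeping the remaining
// defaults (struct update syntax); the timeout value is illustrative:
//
//     let config = FetcherConfig {
//         timeout: Duration::from_secs(30),
//         ..FetcherConfig::default()
//     };
//     let fetcher = Fetcher::new_with_config(config);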

// for GitHub
impl Fetcher {
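    /// Builds a basic preview by scraping the repository page's Open Graph and
    /// Twitter card metadata.
    ///
    /// A minimal usage sketch (the owner/repo pair is illustrative):
    ///
    /// ```ignore
    /// let preview = fetcher.fetch_github_basic_preview("rust-lang", "rust").await?;
    /// println!("{}: {:?}", preview.title, preview.description);
    /// ```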
    pub async fn fetch_github_basic_preview(
        &self,
        owner: &str,
        repo: &str,
    ) -> Result<GitHubBasicPreview, PreviewError> {
        let url = format!("https://github.com/{}/{}", owner, repo);
        debug!("Fetching basic preview for repository: {}/{}", owner, repo);

        let response =
            self.client.get(&url).send().await.map_err(|e| {
                PreviewError::FetchError(format!("Failed to fetch GitHub page: {}", e))
            })?;

        let html = response.text().await.map_err(|e| {
            PreviewError::FetchError(format!("Failed to read response body: {}", e))
        })?;

        let document = Html::parse_document(&html);

        let title = self.extract_title(&document)?;
        let description = self.extract_description(&document);
        let image_url = self.extract_og_image(&document);

        if let Some(ref url) = image_url {
            debug!("Found GitHub repo preview image URL: {}", url);
        } else {
            warn!("GitHub repo preview image URL not found");
        }

        Ok(GitHubBasicPreview {
            title,
            description,
            image_url,
        })
    }

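    /// Collects repository statistics from the GitHub REST API.
    ///
    /// A minimal usage sketch (the owner/repo pair is illustrative):
    ///
    /// ```ignore
    /// let info = fetcher.fetch_github_detailed_info("rust-lang", "rust").await?;
    /// println!("{} stars, {} forks", info.stars_count, info.forks_count);
    /// ```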
    pub async fn fetch_github_detailed_info(
        &self,
        owner: &str,
        repo: &str,
    ) -> Result<GitHubDetailedInfo, PreviewError> {
        let api_url = format!("https://api.github.com/repos/{}/{}", owner, repo);
        debug!("Fetching detailed info from GitHub API: {}", api_url);

        let response = self
            .client
            .get(&api_url)
            .header("Accept", "application/vnd.github.v3+json")
            .send()
            .await
            .map_err(|e| PreviewError::FetchError(format!("GitHub API request failed: {}", e)))?;

        let repo_data: serde_json::Value = response.json().await.map_err(|e| {
            PreviewError::ExtractError(format!("Failed to parse GitHub API response: {}", e))
        })?;

        let contributors_url = format!("{}/contributors?per_page=1", api_url);
        let contributors_count = self.get_contributors_count(&contributors_url).await?;

        Ok(GitHubDetailedInfo {
            stars_count: repo_data["stargazers_count"].as_u64().unwrap_or(0) as u32,
            forks_count: repo_data["forks_count"].as_u64().unwrap_or(0) as u32,
            contributors_count,
            issues_count: repo_data["open_issues_count"].as_u64().unwrap_or(0) as u32,
            discussions_count: repo_data["discussions_count"].as_u64().unwrap_or(0) as u32,
            primary_language: repo_data["language"].as_str().map(String::from),
        })
    }

    fn extract_title(&self, document: &Html) -> Result<String, PreviewError> {
        let og_title_selector = Selector::parse("meta[property='og:title']")
            .map_err(|e| PreviewError::ExtractError(format!("Invalid selector: {}", e)))?;

        document
            .select(&og_title_selector)
            .next()
            .and_then(|el| el.value().attr("content"))
            .map(String::from)
            .ok_or_else(|| PreviewError::ExtractError("Title not found".into()))
    }

    fn extract_description(&self, document: &Html) -> Option<String> {
        let selector = Selector::parse("meta[property='og:description']").ok()?;
        document
            .select(&selector)
            .next()
            .and_then(|el| el.value().attr("content"))
            .map(String::from)
    }

    fn extract_og_image(&self, document: &Html) -> Option<String> {
        let twitter_image_selector = Selector::parse("meta[name='twitter:image']").ok()?;

        if let Some(url) = document
            .select(&twitter_image_selector)
            .next()
            .and_then(|el| el.value().attr("content"))
        {
            debug!("Found Twitter image URL: {}", url);
            return Some(url.to_string());
        }

        // Fall back to og:image when twitter:image is not present
        let og_image_selector = Selector::parse("meta[property='og:image']").ok()?;

        document
            .select(&og_image_selector)
            .next()
            .and_then(|el| el.value().attr("content"))
            .map(|url| {
                debug!("Found Open Graph image URL: {}", url);
                url.to_string()
            })
    }

    async fn get_contributors_count(&self, url: &str) -> Result<u32, PreviewError> {
        let response = self.client.get(url).send().await.map_err(|e| {
            PreviewError::FetchError(format!("Failed to fetch contributors: {}", e))
        })?;

        if let Some(link_header) = response.headers().get("Link") {
            if let Ok(link_str) = link_header.to_str() {
                if let Some(last_page) = parse_github_link_header(link_str) {
                    return Ok(last_page);
                }
            }
        }

        Ok(1)
    }
}

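/// Extracts the page number from the `rel="last"` entry of a GitHub `Link`
/// header, which is how the contributors endpoint exposes its total page count.
///
/// A minimal sketch of the expected input and output (the URLs are
/// illustrative):
///
/// ```ignore
/// let link = "<https://api.github.com/repos/owner/repo/contributors?per_page=1&page=2>; rel=\"next\", \
///             <https://api.github.com/repos/owner/repo/contributors?per_page=1&page=817>; rel=\"last\"";
/// assert_eq!(parse_github_link_header(link), Some(817));
/// ```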
fn parse_github_link_header(link_str: &str) -> Option<u32> {
    // The Link header looks like:
    // <https://api.github.com/repos/owner/repo/contributors?page=2>; rel="next",
    // <https://api.github.com/repos/owner/repo/contributors?page=817>; rel="last"

    for link in link_str.split(',') {
        if link.contains("rel=\"last\"") {
            if let Some(page) = link
                .split(';')
                .next()
                .map(|url| url.trim_matches(|c| c == '<' || c == '>' || c == ' '))
                // Read the `page` query parameter explicitly rather than taking
                // whatever follows the last `=`, which breaks when another
                // parameter (e.g. `per_page`) comes after `page`.
                .and_then(|url| {
                    url.split(&['?', '&'][..])
                        .find_map(|param| param.strip_prefix("page="))
                })
                .and_then(|page| page.parse().ok())
            {
                return Some(page);
            }
        }
    }
    None
}