url_preview/
fetcher.rs

1use super::is_twitter_url;
2#[cfg(feature = "github")]
3use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
4use crate::PreviewError;
5use reqwest::{header::HeaderMap, Client};
6use scraper::{Html, Selector};
7use serde::Deserialize;
8use std::time::Duration;
9#[cfg(feature = "logging")]
10use tracing::{debug, error, instrument, warn};
11
/// Fields read from an oEmbed JSON response.
///
/// `author_name` and `author_url` are optional in the payload and fall back to
/// an empty string (`#[serde(default)]`). The remaining fields carry no
/// default, so deserialization fails when any of them is missing.
#[derive(Debug, Clone, Deserialize)]
pub struct OEmbedResponse {
    /// Value of the `html` key in the response.
    pub html: String,
    /// Value of the `author_name` key; empty when absent.
    #[serde(default)]
    pub author_name: String,
    /// Value of the `author_url` key; empty when absent.
    #[serde(default)]
    pub author_url: String,
    /// Value of the `provider_name` key.
    pub provider_name: String,
    /// Value of the `provider_url` key.
    pub provider_url: String,
}
22
/// HTTP fetcher; every request made by its methods goes through the wrapped
/// `reqwest` [`Client`].
#[derive(Clone)]
pub struct Fetcher {
    // Client used for all outgoing requests.
    client: Client,
}
27
/// Outcome of [`Fetcher::fetch`].
#[derive(Debug, Clone)]
pub enum FetchResult {
    /// Response body text of a regular page fetch.
    Html(String),
    /// Deserialized oEmbed response (produced for Twitter URLs when the
    /// `twitter` feature is enabled).
    OEmbed(OEmbedResponse),
}
33
34impl Default for Fetcher {
35    fn default() -> Self {
36        Self::new()
37    }
38}
39
40impl Fetcher {
41    pub fn new() -> Self {
42        let user_agent = "url_preview/0.1.0";
43        let timeout = Duration::from_secs(10);
44        #[cfg(feature = "logging")]
45        debug!("Fetcher initialized with default configuration");
46
47        Self::new_with_custom_config(timeout, user_agent)
48    }
49
50    pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
51        let client = Client::builder()
52            .timeout(timeout)
53            .user_agent(user_agent)
54            .pool_max_idle_per_host(10)
55            .build()
56            .unwrap_or_else(|e| {
57                #[cfg(feature = "logging")]
58                error!(error = %e, "Failed to create HTTP client");
59                panic!("Failed to initialize HTTP client: {e}");
60            });
61        Fetcher { client }
62    }
63
64    pub fn with_client(client: Client) -> Self {
65        Self { client }
66    }
67
68    pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
69        let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
70        let results = futures::future::join_all(futures).await;
71
72        let mut responses = Vec::new();
73        for result in results {
74            match result {
75                Ok(response) => responses.push(response),
76                Err(e) => return Err(e),
77            }
78        }
79
80        Ok(responses)
81    }
82
83    #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
84    pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
85        let max_retries = 3;
86        let mut delay = Duration::from_millis(1000);
87
88        for attempt in 0..max_retries {
89            #[cfg(feature = "logging")]
90            debug!(attempt = attempt + 1, "Attempting to fetch URL");
91
92            match self.client.get(url).send().await {
93                Ok(response) => {
94                    // Check for 404 first
95                    if response.status() == 404 {
96                        return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
97                    }
98
99                    if response.status().is_success() {
100                        #[cfg(feature = "logging")]
101                        debug!(url = %url, "Successfully fetched URL");
102                        return response.text().await.map_err(|e| {
103                            #[cfg(feature = "logging")]
104                            error!(error = %e, "Failed to read response body");
105                            PreviewError::FetchError(e.to_string())
106                        });
107                    }
108
109                    // For server errors (5xx), retry
110                    if response.status().is_server_error() && attempt < max_retries - 1 {
111                        #[cfg(feature = "logging")]
112                        warn!(
113                            status = %response.status(),
114                            attempt = attempt + 1,
115                            "Server error, retrying after delay"
116                        );
117                        tokio::time::sleep(delay).await;
118                        delay *= 2;
119                        continue;
120                    }
121
122                    // For client errors (4xx except 404) or final attempt, return error
123                    let status = response.status().as_u16();
124                    let message = format!("Server returned status: {}", response.status());
125                    return Err(match status {
126                        400..=499 => PreviewError::ClientError { status, message },
127                        500..=599 => PreviewError::ServerError { status, message },
128                        _ => PreviewError::HttpError { status, message },
129                    });
130                }
131                Err(e) => {
132                    let preview_error = PreviewError::from_reqwest_error(e);
133
134                    // Only retry on server errors or timeouts
135                    let should_retry = matches!(
136                        &preview_error,
137                        PreviewError::ServerError { .. }
138                            | PreviewError::TimeoutError(_)
139                            | PreviewError::ConnectionError(_)
140                    );
141
142                    if should_retry && attempt < max_retries - 1 {
143                        #[cfg(feature = "logging")]
144                        warn!(
145                            error = %preview_error,
146                            attempt = attempt + 1,
147                            "Request error, retrying after delay"
148                        );
149                        tokio::time::sleep(delay).await;
150                        delay *= 2;
151                        continue;
152                    }
153                    #[cfg(feature = "logging")]
154                    error!(error = %preview_error, "Request failed");
155                    return Err(preview_error);
156                }
157            }
158        }
159
160        #[cfg(feature = "logging")]
161        error!("Failed to fetch URL after maximum retries");
162        Err(PreviewError::FetchError("Max retries exceeded".to_string()))
163    }
164
165    #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
166    pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
167        #[cfg(feature = "logging")]
168        debug!(url = %url, "Starting fetch request");
169
170        if is_twitter_url(url) {
171            #[cfg(feature = "logging")]
172            debug!(url = %url, "Detected Twitter URL, using oEmbed API");
173            #[cfg(feature = "twitter")]
174            {
175                let oembed = self.fetch_twitter_oembed(url).await?;
176                Ok(FetchResult::OEmbed(oembed))
177            }
178            #[cfg(not(feature = "twitter"))]
179            {
180                // Fall back to regular HTML fetching
181                self.fetch_html(url).await.map(FetchResult::Html)
182            }
183        } else {
184            #[cfg(feature = "logging")]
185            debug!(url = %url, "Fetching regular webpage");
186            self.fetch_html(url).await.map(FetchResult::Html)
187        }
188    }
189
190    async fn fetch_html(&self, url: &str) -> Result<String, PreviewError> {
191        let response = self.client.get(url).send().await.map_err(|e| {
192            #[cfg(feature = "logging")]
193            error!(error = %e, url = %url, "Failed to send request");
194            PreviewError::from_reqwest_error(e)
195        })?;
196
197        // Check for 404 or other error status codes
198        if response.status() == 404 {
199            return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
200        }
201
202        if !response.status().is_success() {
203            let status = response.status().as_u16();
204            let message = format!("Server returned status: {}", response.status());
205
206            return Err(match status {
207                400..=499 => PreviewError::ClientError { status, message },
208                500..=599 => PreviewError::ServerError { status, message },
209                _ => PreviewError::HttpError { status, message },
210            });
211        }
212
213        let content = response.text().await.map_err(|e| {
214            #[cfg(feature = "logging")]
215            error!(error = %e, url = %url, "Failed to read response body");
216            PreviewError::FetchError(e.to_string())
217        })?;
218
219        #[cfg(feature = "logging")]
220        debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
221        Ok(content)
222    }
223
224    #[cfg(feature = "twitter")]
225    #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
226    async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
227        let oembed_url = format!(
228            "https://publish.twitter.com/oembed?url={tweet_url}&omit_script=1&lang=en"
229        );
230
231        #[cfg(feature = "logging")]
232        debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");
233
234        let response = self.client.get(&oembed_url).send().await.map_err(|e| {
235            #[cfg(feature = "logging")]
236            error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
237            // For external services, we wrap the specific error
238            let inner_error = PreviewError::from_reqwest_error(e);
239            match inner_error {
240                PreviewError::DnsError(msg) => PreviewError::ExternalServiceError {
241                    service: "Twitter".to_string(),
242                    message: format!("DNS error: {msg}"),
243                },
244                PreviewError::TimeoutError(msg) => PreviewError::ExternalServiceError {
245                    service: "Twitter".to_string(),
246                    message: format!("Timeout: {msg}"),
247                },
248                PreviewError::ConnectionError(msg) => PreviewError::ExternalServiceError {
249                    service: "Twitter".to_string(),
250                    message: format!("Connection error: {msg}"),
251                },
252                _ => PreviewError::ExternalServiceError {
253                    service: "Twitter".to_string(),
254                    message: inner_error.to_string(),
255                },
256            }
257        })?;
258
259        // Check for 404 or other error status codes
260        if response.status() == 404 {
261            return Err(PreviewError::NotFound(format!(
262                "Twitter/X content not found: {tweet_url}"
263            )));
264        }
265
266        if !response.status().is_success() {
267            let status = response.status().as_u16();
268            let message = format!("Twitter API returned status: {}", response.status());
269
270            // For Twitter, we still wrap it as an external service error but include status info
271            return Err(PreviewError::ExternalServiceError {
272                service: "Twitter".to_string(),
273                message: match status {
274                    400..=499 => format!("Client error ({status}): {message}"),
275                    500..=599 => format!("Server error ({status}): {message}"),
276                    _ => format!("HTTP error ({status}): {message}"),
277                },
278            });
279        }
280
281        let oembed: OEmbedResponse = response.json().await.map_err(|e| {
282            #[cfg(feature = "logging")]
283            error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
284            PreviewError::ExternalServiceError {
285                service: "Twitter".to_string(),
286                message: e.to_string(),
287            }
288        })?;
289
290        #[cfg(feature = "logging")]
291        debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
292        Ok(oembed)
293    }
294}
295
296// for Twitter
297#[cfg(feature = "twitter")]
298impl Fetcher {
299    #[cfg_attr(feature = "logging", instrument(level = "debug"))]
300    pub fn new_twitter_client() -> Self {
301        #[cfg(feature = "logging")]
302        debug!("Creating Twitter-specific fetcher");
303
304        let mut headers = HeaderMap::new();
305
306        headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
307        headers.insert(
308            "Accept",
309            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
310                .parse()
311                .unwrap(),
312        );
313
314        headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
315        headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
316        headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
317        headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
318        headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());
319
320        headers.insert("Cache-Control", "no-cache".parse().unwrap());
321        headers.insert("Pragma", "no-cache".parse().unwrap());
322
323        let client = Client::builder()
324            .user_agent(
325                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
326                AppleWebKit/537.36 (KHTML, like Gecko) \
327                Chrome/119.0.0.0 Safari/537.36",
328            )
329            .timeout(Duration::from_secs(30))
330            .redirect(reqwest::redirect::Policy::limited(10))
331            .default_headers(headers)
332            .build()
333            .expect("Failed to create Twitter HTTP client");
334
335        #[cfg(feature = "logging")]
336        debug!("Twitter-specific fetcher created successfully");
337        Self { client }
338    }
339
340    /// Creates a Fetcher with custom configuration
341    /// This method allows users to provide their own configuration options
342    pub fn new_with_config(config: FetcherConfig) -> Self {
343        let mut client_builder = Client::builder()
344            .user_agent(config.user_agent)
345            .timeout(config.timeout);
346
347        // Apply custom headers
348        if let Some(headers) = config.headers {
349            client_builder = client_builder.default_headers(headers);
350        }
351
352        // Apply redirect policy
353        if let Some(redirect_policy) = config.redirect_policy {
354            client_builder = client_builder.redirect(redirect_policy);
355        }
356
357        let client = client_builder
358            .build()
359            .expect("Failed to create HTTP client with custom config");
360
361        Self { client }
362    }
363}
364
365// for GitHub
366#[cfg(feature = "github")]
367impl Fetcher {
368    pub fn new_github_client() -> Self {
369        #[cfg(feature = "logging")]
370        debug!("Creating GitHub-specific client");
371
372        let mut headers = HeaderMap::new();
373        headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());
374
375        if let Ok(token) = std::env::var("GITHUB_TOKEN") {
376            #[cfg(feature = "logging")]
377            debug!("Found GitHub token in environment");
378            headers.insert(
379                "Authorization",
380                format!("Bearer {token}").parse().unwrap(),
381            );
382        }
383
384        let client = Client::builder()
385            .user_agent("url_preview/1.0")
386            .default_headers(headers)
387            .timeout(Duration::from_secs(10))
388            .build()
389            .expect("Failed to create GitHub HTTP client");
390
391        Self { client }
392    }
393
394    pub async fn fetch_github_repo(
395        &self,
396        owner: &str,
397        repo: &str,
398    ) -> Result<GitHubRepository, PreviewError> {
399        let url = format!("https://api.github.com/repos/{owner}/{repo}");
400        #[cfg(feature = "logging")]
401        debug!(url = %url, "Fetching GitHub repository information");
402
403        let response = self
404            .client
405            .get(&url)
406            .send()
407            .await
408            .map_err(PreviewError::from_reqwest_error)?;
409
410        // Check for 404 or other error status codes
411        if response.status() == 404 {
412            return Err(PreviewError::NotFound(format!(
413                "GitHub repository {owner}/{repo} not found"
414            )));
415        }
416
417        if !response.status().is_success() {
418            let status = response.status().as_u16();
419            let message = format!("API returned status: {}", response.status());
420
421            return Err(match status {
422                400..=499 => PreviewError::ClientError { status, message },
423                500..=599 => PreviewError::ServerError { status, message },
424                _ => PreviewError::HttpError { status, message },
425            });
426        }
427
428        let repo_info: GitHubRepository = response
429            .json()
430            .await
431            .map_err(|e| PreviewError::ParseError(e.to_string()))?;
432
433        Ok(repo_info)
434    }
435
436    /// A helper function to extract GitHub owner and repo from URL
437    /// Examples:
438    /// - https://github.com/rust-lang/rust -> (rust-lang, rust)
439    /// - https://github.com/rust-lang/rust/issues/123 -> (rust-lang, rust)
440    pub fn parse_github_url(url: &str) -> Option<(String, String)> {
441        let parts: Vec<&str> = url
442            .trim_start_matches("https://")
443            .trim_start_matches("github.com/")
444            .split('/')
445            .collect();
446
447        if parts.len() >= 2 {
448            return Some((parts[0].to_string(), parts[1].to_string()));
449        }
450
451        None
452    }
453
454    /// Extracts Open Graph image from HTML
455    fn extract_og_image(html: &str) -> Option<String> {
456        let document = Html::parse_document(html);
457        let selector = Selector::parse("meta[property='og:image']").ok()?;
458
459        document
460            .select(&selector)
461            .next()
462            .and_then(|elem| elem.value().attr("content"))
463            .map(|s| s.to_string())
464    }
465
466    /// Gets a basic preview using HTML scraping (no API key required)
467    pub async fn fetch_github_basic_preview(
468        &self,
469        owner: &str,
470        repo: &str,
471    ) -> Result<GitHubBasicPreview, PreviewError> {
472        let url = format!("https://github.com/{owner}/{repo}");
473        #[cfg(feature = "logging")]
474        debug!("Fetching basic preview for repository: {}/{}", owner, repo);
475
476        let response = self
477            .client
478            .get(&url)
479            .send()
480            .await
481            .map_err(PreviewError::from_reqwest_error)?;
482
483        // Check for 404 or other error status codes
484        if response.status() == 404 {
485            return Err(PreviewError::NotFound(format!(
486                "GitHub repository {owner}/{repo} not found"
487            )));
488        }
489
490        if !response.status().is_success() {
491            return Err(PreviewError::FetchError(format!(
492                "GitHub returned status: {}",
493                response.status()
494            )));
495        }
496
497        let html = response
498            .text()
499            .await
500            .map_err(|e| PreviewError::FetchError(e.to_string()))?;
501
502        let document = Html::parse_document(&html);
503
504        // Extract title, description, and image
505        let title = Self::extract_meta_content(&document, "meta[property='og:title']");
506        let description = Self::extract_meta_content(&document, "meta[property='og:description']");
507        let image_url = Self::extract_og_image(&html);
508
509        #[cfg(feature = "logging")]
510        {
511            if let Some(ref url) = image_url {
512                debug!("Found GitHub Reop Preview Image URL: {}", url);
513            } else {
514                warn!("Not Found GitHub Reop Preview Image URL");
515            }
516        }
517
518        Ok(GitHubBasicPreview {
519            title,
520            description,
521            image_url,
522        })
523    }
524
525    /// Gets detailed info using the GitHub API
526    pub async fn fetch_github_detailed_info(
527        &self,
528        owner: &str,
529        repo: &str,
530    ) -> Result<GitHubDetailedInfo, PreviewError> {
531        let api_url = format!("https://api.github.com/repos/{owner}/{repo}");
532        #[cfg(feature = "logging")]
533        debug!("Fetching detailed info from GitHub API: {}", api_url);
534
535        let response = self
536            .client
537            .get(&api_url)
538            .send()
539            .await
540            .map_err(PreviewError::from_reqwest_error)?;
541
542        // Check for 404 or other error status codes
543        if response.status() == 404 {
544            return Err(PreviewError::NotFound(format!(
545                "GitHub repository {owner}/{repo} not found"
546            )));
547        }
548
549        if !response.status().is_success() {
550            let status = response.status().as_u16();
551            let message = format!("API returned status: {}", response.status());
552
553            return Err(match status {
554                400..=499 => PreviewError::ClientError { status, message },
555                500..=599 => PreviewError::ServerError { status, message },
556                _ => PreviewError::HttpError { status, message },
557            });
558        }
559
560        let data: serde_json::Value = response
561            .json()
562            .await
563            .map_err(|e| PreviewError::ParseError(e.to_string()))?;
564
565        Ok(GitHubDetailedInfo {
566            full_name: data["full_name"].as_str().unwrap_or("").to_string(),
567            description: data["description"]
568                .as_str()
569                .map(|s| s.to_string())
570                .unwrap_or_default(),
571            stars_count: data["stargazers_count"].as_u64().unwrap_or(0) as u32,
572            forks_count: data["forks_count"].as_u64().unwrap_or(0) as u32,
573            open_issues_count: data["open_issues_count"].as_u64().unwrap_or(0) as u32,
574            language: data["language"].as_str().map(|s| s.to_string()),
575            default_branch: data["default_branch"]
576                .as_str()
577                .unwrap_or("main")
578                .to_string(),
579            topics: data["topics"]
580                .as_array()
581                .map(|arr| {
582                    arr.iter()
583                        .filter_map(|v| v.as_str().map(|s| s.to_string()))
584                        .collect()
585                })
586                .unwrap_or_default(),
587            html_url: data["html_url"].as_str().unwrap_or(&api_url).to_string(),
588            homepage: data["homepage"]
589                .as_str()
590                .filter(|s| !s.is_empty())
591                .map(|s| s.to_string()),
592        })
593    }
594
595    fn extract_meta_content(document: &Html, selector_str: &str) -> Option<String> {
596        let selector = Selector::parse(selector_str).ok()?;
597        document
598            .select(&selector)
599            .next()
600            .and_then(|elem| elem.value().attr("content"))
601            .map(|s| s.to_string())
602    }
603}
604
605// Helper functions that don't depend on features
606impl Fetcher {
607    pub fn extract_twitter_image_from_html(html: &str) -> Option<String> {
608        let document = Html::parse_document(html);
609        let selector = Selector::parse("meta[name='twitter:image']").ok()?;
610
611        if let Some(url) = document
612            .select(&selector)
613            .next()
614            .and_then(|elem| elem.value().attr("content"))
615        {
616            #[cfg(feature = "logging")]
617            debug!("Found Twitter image URL: {}", url);
618            return Some(url.to_string());
619        }
620
621        let og_selector = Selector::parse("meta[property='og:image']").ok()?;
622        document
623            .select(&og_selector)
624            .next()
625            .and_then(|elem| elem.value().attr("content"))
626            .map(|url| {
627                #[cfg(feature = "logging")]
628                debug!("Found Open Graph image URL: {}", url);
629                url.to_string()
630            })
631    }
632}
633
634/// Configuration for the Fetcher
635pub struct FetcherConfig {
636    pub user_agent: String,
637    pub timeout: Duration,
638    pub headers: Option<HeaderMap>,
639    pub redirect_policy: Option<reqwest::redirect::Policy>,
640}
641
642impl Default for FetcherConfig {
643    fn default() -> Self {
644        Self {
645            user_agent: "url_preview/0.1.0".to_string(),
646            timeout: Duration::from_secs(10),
647            headers: None,
648            redirect_policy: None,
649        }
650    }
651}