url_preview/
fetcher.rs

1use super::is_twitter_url;
2#[cfg(feature = "github")]
3use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
4use crate::PreviewError;
5use reqwest::{header::HeaderMap, Client};
6use scraper::{Html, Selector};
7use serde::Deserialize;
8use std::time::Duration;
9#[cfg(feature = "logging")]
10use tracing::{debug, error, instrument, warn};
11
/// Deserialized oEmbed payload, as returned by the Twitter oEmbed endpoint
/// queried in `Fetcher::fetch_twitter_oembed`.
#[derive(Debug, Clone, Deserialize)]
pub struct OEmbedResponse {
    /// Embeddable HTML snippet; required in the response.
    pub html: String,
    /// Author display name; falls back to an empty string when the field is absent.
    #[serde(default)]
    pub author_name: String,
    /// Author profile URL; falls back to an empty string when the field is absent.
    #[serde(default)]
    pub author_url: String,
    /// Name of the oEmbed provider; required in the response.
    pub provider_name: String,
    /// URL of the oEmbed provider; required in the response.
    pub provider_url: String,
}
22
/// HTTP fetcher used to retrieve pages and API responses for URL previews.
///
/// Wraps a single `reqwest::Client` that every request in this type goes through.
#[derive(Clone)]
pub struct Fetcher {
    /// Underlying HTTP client shared by all fetch methods.
    client: Client,
}
27
/// Outcome of [`Fetcher::fetch`]: either raw page HTML or a parsed oEmbed payload.
#[derive(Debug, Clone)]
pub enum FetchResult {
    /// Response body of a regular page fetch.
    Html(String),
    /// Parsed oEmbed response (produced for Twitter URLs when the `twitter` feature is enabled).
    OEmbed(OEmbedResponse),
}
33
34impl Default for Fetcher {
35    fn default() -> Self {
36        Self::new()
37    }
38}
39
40impl Fetcher {
41    pub fn new() -> Self {
42        let user_agent = "url_preview/0.1.0";
43        let timeout = Duration::from_secs(10);
44        #[cfg(feature = "logging")]
45        debug!("Fetcher initialized with default configuration");
46
47        Self::new_with_custom_config(timeout, user_agent)
48    }
49
50    pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
51        let client = Client::builder()
52            .timeout(timeout)
53            .user_agent(user_agent)
54            .pool_max_idle_per_host(10)
55            .build()
56            .unwrap_or_else(|e| {
57                #[cfg(feature = "logging")]
58                error!(error = %e, "Failed to create HTTP client");
59                panic!("Failed to initialize HTTP client: {}", e);
60            });
61        Fetcher { client }
62    }
63
64    pub fn with_client(client: Client) -> Self {
65        Self { client }
66    }
67
68    pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
69        let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
70        let results = futures::future::join_all(futures).await;
71
72        let mut responses = Vec::new();
73        for result in results {
74            match result {
75                Ok(response) => responses.push(response),
76                Err(e) => return Err(e),
77            }
78        }
79
80        Ok(responses)
81    }
82
    /// Fetches `url` and returns the response body, retrying transient
    /// failures with exponential backoff.
    ///
    /// Up to three attempts are made; the wait between attempts starts at one
    /// second and doubles after every retry.
    ///
    /// # Errors
    ///
    /// * `PreviewError::NotFound` immediately on a 404 response (never retried).
    /// * `PreviewError::ClientError` / `ServerError` / `HttpError` for other
    ///   non-success statuses; only 5xx responses are retried.
    /// * The converted transport error when the request itself fails; only
    ///   server, timeout, and connection errors are retried.
    /// * `PreviewError::FetchError` when the response body cannot be read.
    #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
    pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
        let max_retries = 3;
        let mut delay = Duration::from_millis(1000);

        for attempt in 0..max_retries {
            #[cfg(feature = "logging")]
            debug!(attempt = attempt + 1, "Attempting to fetch URL");

            match self.client.get(url).send().await {
                Ok(response) => {
                    // A missing resource will not appear on retry: fail fast.
                    if response.status() == 404 {
                        return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
                    }

                    if response.status().is_success() {
                        #[cfg(feature = "logging")]
                        debug!(url = %url, "Successfully fetched URL");
                        return response.text().await.map_err(|e| {
                            #[cfg(feature = "logging")]
                            error!(error = %e, "Failed to read response body");
                            PreviewError::FetchError(e.to_string())
                        });
                    }

                    // Server errors (5xx) are retried as long as attempts remain.
                    if response.status().is_server_error() && attempt < max_retries - 1 {
                        #[cfg(feature = "logging")]
                        warn!(
                            status = %response.status(),
                            attempt = attempt + 1,
                            "Server error, retrying after delay"
                        );
                        tokio::time::sleep(delay).await;
                        delay *= 2;
                        continue;
                    }

                    // Client errors (4xx other than 404), other non-success statuses,
                    // or a 5xx on the final attempt: report the status to the caller.
                    let status = response.status().as_u16();
                    let message = format!("Server returned status: {}", response.status());
                    return Err(match status {
                        400..=499 => PreviewError::ClientError { status, message },
                        500..=599 => PreviewError::ServerError { status, message },
                        _ => PreviewError::HttpError { status, message },
                    });
                }
                Err(e) => {
                    let preview_error = PreviewError::from_reqwest_error(e);

                    // Retry only on server errors, timeouts, and connection errors.
                    let should_retry = matches!(
                        &preview_error,
                        PreviewError::ServerError { .. }
                            | PreviewError::TimeoutError(_)
                            | PreviewError::ConnectionError(_)
                    );

                    if should_retry && attempt < max_retries - 1 {
                        #[cfg(feature = "logging")]
                        warn!(
                            error = %preview_error,
                            attempt = attempt + 1,
                            "Request error, retrying after delay"
                        );
                        tokio::time::sleep(delay).await;
                        delay *= 2;
                        continue;
                    }
                    #[cfg(feature = "logging")]
                    error!(error = %preview_error, "Request failed");
                    return Err(preview_error);
                }
            }
        }

        // Fallback for the case where the loop ends without returning; every
        // branch above either returns or continues while attempts remain.
        #[cfg(feature = "logging")]
        error!("Failed to fetch URL after maximum retries");
        Err(PreviewError::FetchError("Max retries exceeded".to_string()))
    }
164
165    #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
166    pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
167        #[cfg(feature = "logging")]
168        debug!(url = %url, "Starting fetch request");
169
170        if is_twitter_url(url) {
171            #[cfg(feature = "logging")]
172            debug!(url = %url, "Detected Twitter URL, using oEmbed API");
173            #[cfg(feature = "twitter")]
174            {
175                let oembed = self.fetch_twitter_oembed(url).await?;
176                Ok(FetchResult::OEmbed(oembed))
177            }
178            #[cfg(not(feature = "twitter"))]
179            {
180                // Fall back to regular HTML fetching
181                self.fetch_html(url).await.map(FetchResult::Html)
182            }
183        } else {
184            #[cfg(feature = "logging")]
185            debug!(url = %url, "Fetching regular webpage");
186            self.fetch_html(url).await.map(FetchResult::Html)
187        }
188    }
189
190    async fn fetch_html(&self, url: &str) -> Result<String, PreviewError> {
191        let response = self.client.get(url).send().await.map_err(|e| {
192            #[cfg(feature = "logging")]
193            error!(error = %e, url = %url, "Failed to send request");
194            PreviewError::from_reqwest_error(e)
195        })?;
196
197        // Check for 404 or other error status codes
198        if response.status() == 404 {
199            return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
200        }
201
202        if !response.status().is_success() {
203            let status = response.status().as_u16();
204            let message = format!("Server returned status: {}", response.status());
205
206            return Err(match status {
207                400..=499 => PreviewError::ClientError { status, message },
208                500..=599 => PreviewError::ServerError { status, message },
209                _ => PreviewError::HttpError { status, message },
210            });
211        }
212
213        let content = response.text().await.map_err(|e| {
214            #[cfg(feature = "logging")]
215            error!(error = %e, url = %url, "Failed to read response body");
216            PreviewError::FetchError(e.to_string())
217        })?;
218
219        #[cfg(feature = "logging")]
220        debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
221        Ok(content)
222    }
223
224    #[cfg(feature = "twitter")]
225    #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
226    async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
227        let oembed_url = format!(
228            "https://publish.twitter.com/oembed?url={}&omit_script=1&lang=en",
229            tweet_url
230        );
231
232        #[cfg(feature = "logging")]
233        debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");
234
235        let response = self.client.get(&oembed_url).send().await.map_err(|e| {
236            #[cfg(feature = "logging")]
237            error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
238            // For external services, we wrap the specific error
239            let inner_error = PreviewError::from_reqwest_error(e);
240            match inner_error {
241                PreviewError::DnsError(msg) => PreviewError::ExternalServiceError {
242                    service: "Twitter".to_string(),
243                    message: format!("DNS error: {}", msg),
244                },
245                PreviewError::TimeoutError(msg) => PreviewError::ExternalServiceError {
246                    service: "Twitter".to_string(),
247                    message: format!("Timeout: {}", msg),
248                },
249                PreviewError::ConnectionError(msg) => PreviewError::ExternalServiceError {
250                    service: "Twitter".to_string(),
251                    message: format!("Connection error: {}", msg),
252                },
253                _ => PreviewError::ExternalServiceError {
254                    service: "Twitter".to_string(),
255                    message: inner_error.to_string(),
256                },
257            }
258        })?;
259
260        // Check for 404 or other error status codes
261        if response.status() == 404 {
262            return Err(PreviewError::NotFound(format!(
263                "Twitter/X content not found: {tweet_url}"
264            )));
265        }
266
267        if !response.status().is_success() {
268            let status = response.status().as_u16();
269            let message = format!("Twitter API returned status: {}", response.status());
270
271            // For Twitter, we still wrap it as an external service error but include status info
272            return Err(PreviewError::ExternalServiceError {
273                service: "Twitter".to_string(),
274                message: match status {
275                    400..=499 => format!("Client error ({}): {}", status, message),
276                    500..=599 => format!("Server error ({}): {}", status, message),
277                    _ => format!("HTTP error ({}): {}", status, message),
278                },
279            });
280        }
281
282        let oembed: OEmbedResponse = response.json().await.map_err(|e| {
283            #[cfg(feature = "logging")]
284            error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
285            PreviewError::ExternalServiceError {
286                service: "Twitter".to_string(),
287                message: e.to_string(),
288            }
289        })?;
290
291        #[cfg(feature = "logging")]
292        debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
293        Ok(oembed)
294    }
295}
296
297// for Twitter
298#[cfg(feature = "twitter")]
299impl Fetcher {
300    #[cfg_attr(feature = "logging", instrument(level = "debug"))]
301    pub fn new_twitter_client() -> Self {
302        #[cfg(feature = "logging")]
303        debug!("Creating Twitter-specific fetcher");
304
305        let mut headers = HeaderMap::new();
306
307        headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
308        headers.insert(
309            "Accept",
310            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
311                .parse()
312                .unwrap(),
313        );
314
315        headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
316        headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
317        headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
318        headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
319        headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());
320
321        headers.insert("Cache-Control", "no-cache".parse().unwrap());
322        headers.insert("Pragma", "no-cache".parse().unwrap());
323
324        let client = Client::builder()
325            .user_agent(
326                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
327                AppleWebKit/537.36 (KHTML, like Gecko) \
328                Chrome/119.0.0.0 Safari/537.36",
329            )
330            .timeout(Duration::from_secs(30))
331            .redirect(reqwest::redirect::Policy::limited(10))
332            .default_headers(headers)
333            .build()
334            .expect("Failed to create Twitter HTTP client");
335
336        #[cfg(feature = "logging")]
337        debug!("Twitter-specific fetcher created successfully");
338        Self { client }
339    }
340
341    /// Creates a Fetcher with custom configuration
342    /// This method allows users to provide their own configuration options
343    pub fn new_with_config(config: FetcherConfig) -> Self {
344        let mut client_builder = Client::builder()
345            .user_agent(config.user_agent)
346            .timeout(config.timeout);
347
348        // Apply custom headers
349        if let Some(headers) = config.headers {
350            client_builder = client_builder.default_headers(headers);
351        }
352
353        // Apply redirect policy
354        if let Some(redirect_policy) = config.redirect_policy {
355            client_builder = client_builder.redirect(redirect_policy);
356        }
357
358        let client = client_builder
359            .build()
360            .expect("Failed to create HTTP client with custom config");
361
362        Self { client }
363    }
364}
365
366// for GitHub
367#[cfg(feature = "github")]
368impl Fetcher {
369    pub fn new_github_client() -> Self {
370        #[cfg(feature = "logging")]
371        debug!("Creating GitHub-specific client");
372
373        let mut headers = HeaderMap::new();
374        headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());
375
376        if let Ok(token) = std::env::var("GITHUB_TOKEN") {
377            #[cfg(feature = "logging")]
378            debug!("Found GitHub token in environment");
379            headers.insert(
380                "Authorization",
381                format!("Bearer {}", token).parse().unwrap(),
382            );
383        }
384
385        let client = Client::builder()
386            .user_agent("url_preview/1.0")
387            .default_headers(headers)
388            .timeout(Duration::from_secs(10))
389            .build()
390            .expect("Failed to create GitHub HTTP client");
391
392        Self { client }
393    }
394
395    pub async fn fetch_github_repo(
396        &self,
397        owner: &str,
398        repo: &str,
399    ) -> Result<GitHubRepository, PreviewError> {
400        let url = format!("https://api.github.com/repos/{}/{}", owner, repo);
401        #[cfg(feature = "logging")]
402        debug!(url = %url, "Fetching GitHub repository information");
403
404        let response = self
405            .client
406            .get(&url)
407            .send()
408            .await
409            .map_err(PreviewError::from_reqwest_error)?;
410
411        // Check for 404 or other error status codes
412        if response.status() == 404 {
413            return Err(PreviewError::NotFound(format!(
414                "GitHub repository {owner}/{repo} not found"
415            )));
416        }
417
418        if !response.status().is_success() {
419            let status = response.status().as_u16();
420            let message = format!("API returned status: {}", response.status());
421
422            return Err(match status {
423                400..=499 => PreviewError::ClientError { status, message },
424                500..=599 => PreviewError::ServerError { status, message },
425                _ => PreviewError::HttpError { status, message },
426            });
427        }
428
429        let repo_info: GitHubRepository = response
430            .json()
431            .await
432            .map_err(|e| PreviewError::ParseError(e.to_string()))?;
433
434        Ok(repo_info)
435    }
436
437    /// A helper function to extract GitHub owner and repo from URL
438    /// Examples:
439    /// - https://github.com/rust-lang/rust -> (rust-lang, rust)
440    /// - https://github.com/rust-lang/rust/issues/123 -> (rust-lang, rust)
441    pub fn parse_github_url(url: &str) -> Option<(String, String)> {
442        let parts: Vec<&str> = url
443            .trim_start_matches("https://")
444            .trim_start_matches("github.com/")
445            .split('/')
446            .collect();
447
448        if parts.len() >= 2 {
449            return Some((parts[0].to_string(), parts[1].to_string()));
450        }
451
452        None
453    }
454
455    /// Extracts Open Graph image from HTML
456    fn extract_og_image(html: &str) -> Option<String> {
457        let document = Html::parse_document(html);
458        let selector = Selector::parse("meta[property='og:image']").ok()?;
459
460        document
461            .select(&selector)
462            .next()
463            .and_then(|elem| elem.value().attr("content"))
464            .map(|s| s.to_string())
465    }
466
467    /// Gets a basic preview using HTML scraping (no API key required)
468    pub async fn fetch_github_basic_preview(
469        &self,
470        owner: &str,
471        repo: &str,
472    ) -> Result<GitHubBasicPreview, PreviewError> {
473        let url = format!("https://github.com/{}/{}", owner, repo);
474        #[cfg(feature = "logging")]
475        debug!("Fetching basic preview for repository: {}/{}", owner, repo);
476
477        let response = self
478            .client
479            .get(&url)
480            .send()
481            .await
482            .map_err(PreviewError::from_reqwest_error)?;
483
484        // Check for 404 or other error status codes
485        if response.status() == 404 {
486            return Err(PreviewError::NotFound(format!(
487                "GitHub repository {owner}/{repo} not found"
488            )));
489        }
490
491        if !response.status().is_success() {
492            return Err(PreviewError::FetchError(format!(
493                "GitHub returned status: {}",
494                response.status()
495            )));
496        }
497
498        let html = response
499            .text()
500            .await
501            .map_err(|e| PreviewError::FetchError(e.to_string()))?;
502
503        let document = Html::parse_document(&html);
504
505        // Extract title, description, and image
506        let title = Self::extract_meta_content(&document, "meta[property='og:title']");
507        let description = Self::extract_meta_content(&document, "meta[property='og:description']");
508        let image_url = Self::extract_og_image(&html);
509
510        #[cfg(feature = "logging")]
511        {
512            if let Some(ref url) = image_url {
513                debug!("Found GitHub Reop Preview Image URL: {}", url);
514            } else {
515                warn!("Not Found GitHub Reop Preview Image URL");
516            }
517        }
518
519        Ok(GitHubBasicPreview {
520            title,
521            description,
522            image_url,
523        })
524    }
525
526    /// Gets detailed info using the GitHub API
527    pub async fn fetch_github_detailed_info(
528        &self,
529        owner: &str,
530        repo: &str,
531    ) -> Result<GitHubDetailedInfo, PreviewError> {
532        let api_url = format!("https://api.github.com/repos/{}/{}", owner, repo);
533        #[cfg(feature = "logging")]
534        debug!("Fetching detailed info from GitHub API: {}", api_url);
535
536        let response = self
537            .client
538            .get(&api_url)
539            .send()
540            .await
541            .map_err(PreviewError::from_reqwest_error)?;
542
543        // Check for 404 or other error status codes
544        if response.status() == 404 {
545            return Err(PreviewError::NotFound(format!(
546                "GitHub repository {owner}/{repo} not found"
547            )));
548        }
549
550        if !response.status().is_success() {
551            let status = response.status().as_u16();
552            let message = format!("API returned status: {}", response.status());
553
554            return Err(match status {
555                400..=499 => PreviewError::ClientError { status, message },
556                500..=599 => PreviewError::ServerError { status, message },
557                _ => PreviewError::HttpError { status, message },
558            });
559        }
560
561        let data: serde_json::Value = response
562            .json()
563            .await
564            .map_err(|e| PreviewError::ParseError(e.to_string()))?;
565
566        Ok(GitHubDetailedInfo {
567            full_name: data["full_name"].as_str().unwrap_or("").to_string(),
568            description: data["description"]
569                .as_str()
570                .map(|s| s.to_string())
571                .unwrap_or_default(),
572            stars_count: data["stargazers_count"].as_u64().unwrap_or(0) as u32,
573            forks_count: data["forks_count"].as_u64().unwrap_or(0) as u32,
574            open_issues_count: data["open_issues_count"].as_u64().unwrap_or(0) as u32,
575            language: data["language"].as_str().map(|s| s.to_string()),
576            default_branch: data["default_branch"]
577                .as_str()
578                .unwrap_or("main")
579                .to_string(),
580            topics: data["topics"]
581                .as_array()
582                .map(|arr| {
583                    arr.iter()
584                        .filter_map(|v| v.as_str().map(|s| s.to_string()))
585                        .collect()
586                })
587                .unwrap_or_default(),
588            html_url: data["html_url"].as_str().unwrap_or(&api_url).to_string(),
589            homepage: data["homepage"]
590                .as_str()
591                .filter(|s| !s.is_empty())
592                .map(|s| s.to_string()),
593        })
594    }
595
596    fn extract_meta_content(document: &Html, selector_str: &str) -> Option<String> {
597        let selector = Selector::parse(selector_str).ok()?;
598        document
599            .select(&selector)
600            .next()
601            .and_then(|elem| elem.value().attr("content"))
602            .map(|s| s.to_string())
603    }
604}
605
606// Helper functions that don't depend on features
607impl Fetcher {
608    pub fn extract_twitter_image_from_html(html: &str) -> Option<String> {
609        let document = Html::parse_document(html);
610        let selector = Selector::parse("meta[name='twitter:image']").ok()?;
611
612        if let Some(url) = document
613            .select(&selector)
614            .next()
615            .and_then(|elem| elem.value().attr("content"))
616        {
617            #[cfg(feature = "logging")]
618            debug!("Found Twitter image URL: {}", url);
619            return Some(url.to_string());
620        }
621
622        let og_selector = Selector::parse("meta[property='og:image']").ok()?;
623        document
624            .select(&og_selector)
625            .next()
626            .and_then(|elem| elem.value().attr("content"))
627            .map(|url| {
628                #[cfg(feature = "logging")]
629                debug!("Found Open Graph image URL: {}", url);
630                url.to_string()
631            })
632    }
633}
634
/// Configuration for the Fetcher, consumed by `Fetcher::new_with_config`.
pub struct FetcherConfig {
    /// Value sent as the `User-Agent` of the client.
    pub user_agent: String,
    /// Timeout applied to the client.
    pub timeout: Duration,
    /// Default headers for the client; `None` leaves the builder's defaults untouched.
    pub headers: Option<HeaderMap>,
    /// Redirect policy for the client; `None` leaves the builder's default policy untouched.
    pub redirect_policy: Option<reqwest::redirect::Policy>,
}
642
643impl Default for FetcherConfig {
644    fn default() -> Self {
645        Self {
646            user_agent: "url_preview/0.1.0".to_string(),
647            timeout: Duration::from_secs(10),
648            headers: None,
649            redirect_policy: None,
650        }
651    }
652}