//! url_preview/fetcher.rs — HTTP fetchers for page HTML, Twitter oEmbed, and GitHub metadata.

1use super::is_twitter_url;
2use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
3use crate::PreviewError;
4use reqwest::{header::HeaderMap, Client};
5use scraper::{Html, Selector};
6use serde::Deserialize;
7use std::time::Duration;
8use tracing::{debug, error, instrument, warn};
9
10#[derive(Debug, Clone, Deserialize)]
11pub struct OEmbedResponse {
12    pub html: String,
13    #[serde(default)]
14    pub author_name: String,
15    #[serde(default)]
16    pub author_url: String,
17    pub provider_name: String,
18    pub provider_url: String,
19}
20
/// HTTP fetcher for URL previews: wraps a configured `reqwest::Client`
/// used for page HTML, Twitter oEmbed, and GitHub API requests.
#[derive(Clone)]
pub struct Fetcher {
    // Shared HTTP client used for all requests made by this fetcher.
    client: Client,
}
25
/// Raw result of a fetch: either a page's HTML body, or a parsed oEmbed
/// payload (used for Twitter URLs).
#[derive(Debug, Clone)]
pub enum FetchResult {
    /// Full HTML body of a regular webpage.
    Html(String),
    /// Parsed oEmbed response from Twitter's publish endpoint.
    OEmbed(OEmbedResponse),
}
31
32impl Fetcher {
33    pub fn new() -> Self {
34        let user_agent = "url_preview/0.1.0";
35        let timeout = Duration::from_secs(10);
36        debug!("Fetcher initialized with default configuration");
37
38        Self::new_with_custom_config(timeout, user_agent)
39    }
40
41    pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
42        let client = Client::builder()
43            .timeout(timeout)
44            .user_agent(user_agent)
45            .pool_max_idle_per_host(10)
46            .build()
47            .unwrap_or_else(|e| {
48                error!(error = %e, "Failed to create HTTP client");
49                panic!("Failed to initialize HTTP client: {}", e);
50            });
51        Fetcher { client }
52    }
53
54    pub fn with_client(client: Client) -> Self {
55        Self { client }
56    }
57
58    pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
59        let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
60        let results = futures::future::join_all(futures).await;
61
62        let mut responses = Vec::new();
63        for result in results {
64            match result {
65                Ok(response) => responses.push(response),
66                Err(e) => return Err(e),
67            }
68        }
69
70        Ok(responses)
71    }
72
73
74    #[instrument(level = "debug", skip(self), err)]
75    pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
76        let max_retries = 3;
77        let mut delay = Duration::from_millis(1000);
78
79        for attempt in 0..max_retries {
80            debug!(attempt = attempt + 1, "Attempting to fetch URL");
81
82            match self.client.get(url).send().await {
83                Ok(response) => {
84                    if response.status().is_success() {
85                        debug!(url = %url, "Successfully fetched URL");
86                        return response.text().await.map_err(|e| {
87                            error!(error = %e, "Failed to read response body");
88                            PreviewError::FetchError(e.to_string())
89                        });
90                    }
91
92                    if attempt < max_retries - 1 {
93                        warn!(
94                            status = %response.status(),
95                            attempt = attempt + 1,
96                            "Request failed, retrying after delay"
97                        );
98                        tokio::time::sleep(delay).await;
99                        delay *= 2;
100                        continue;
101                    }
102                }
103                Err(e) => {
104                    if attempt < max_retries - 1 {
105                        warn!(
106                            error = %e,
107                            attempt = attempt + 1,
108                            "Request error, retrying after delay"
109                        );
110                        tokio::time::sleep(delay).await;
111                        delay *= 2;
112                        continue;
113                    }
114                    error!(error = %e, "Max retries exceeded");
115                    return Err(PreviewError::FetchError(e.to_string()));
116                }
117            }
118        }
119
120        error!("Failed to fetch URL after maximum retries");
121        Err(PreviewError::FetchError("Max retries exceeded".to_string()))
122    }
123
124    #[instrument(level = "debug", skip(self), err)]
125    pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
126        debug!(url = %url, "Starting fetch request");
127
128        if is_twitter_url(url) {
129            debug!(url = %url, "Detected Twitter URL, using oEmbed API");
130            let oembed = self.fetch_twitter_oembed(url).await?;
131            Ok(FetchResult::OEmbed(oembed))
132        } else {
133            debug!(url = %url, "Fetching regular webpage");
134            let content = self
135                .client
136                .get(url)
137                .send()
138                .await
139                .map_err(|e| {
140                    error!(error = %e, url = %url, "Failed to send request");
141                    PreviewError::FetchError(e.to_string())
142                })?
143                .text()
144                .await
145                .map_err(|e| {
146                    error!(error = %e, url = %url, "Failed to read response body");
147                    PreviewError::FetchError(e.to_string())
148                })?;
149
150            debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
151            Ok(FetchResult::Html(content))
152        }
153    }
154
155    #[instrument(level = "debug", skip(self), err)]
156    async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
157        let oembed_url = format!(
158            "https://publish.twitter.com/oembed?url={}&omit_script=1&lang=en",
159            tweet_url
160        );
161
162        debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");
163
164        let response = self.client.get(&oembed_url).send().await.map_err(|e| {
165            error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
166            PreviewError::ExternalServiceError {
167                service: "Twitter".to_string(),
168                message: e.to_string(),
169            }
170        })?;
171
172        let oembed: OEmbedResponse = response.json().await.map_err(|e| {
173            error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
174            PreviewError::ExternalServiceError {
175                service: "Twitter".to_string(),
176                message: e.to_string(),
177            }
178        })?;
179
180        debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
181        Ok(oembed)
182    }
183}
184
185// for Twitter
186impl Fetcher {
187
188    #[instrument(level = "debug")]
189    pub fn new_twitter_client() -> Self {
190        debug!("Creating Twitter-specific fetcher");
191
192        let mut headers = HeaderMap::new();
193
194        headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
195        headers.insert(
196            "Accept",
197            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
198                .parse()
199                .unwrap(),
200        );
201
202        headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
203        headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
204        headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
205        headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
206        headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());
207
208        headers.insert("Cache-Control", "no-cache".parse().unwrap());
209        headers.insert("Pragma", "no-cache".parse().unwrap());
210
211        let client = Client::builder()
212            .user_agent(
213                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
214                AppleWebKit/537.36 (KHTML, like Gecko) \
215                Chrome/119.0.0.0 Safari/537.36",
216            )
217            .timeout(Duration::from_secs(30))
218            .redirect(reqwest::redirect::Policy::limited(10))
219            .default_headers(headers)
220            .build()
221            .expect("Failed to create Twitter HTTP client");
222
223        debug!("Twitter-specific fetcher created successfully");
224        Self { client }
225    }
226
227    /// Creates a Fetcher with custom configuration
228    /// This method allows users to provide their own configuration options
229    pub fn new_with_config(config: FetcherConfig) -> Self {
230        let mut client_builder = Client::builder()
231            .user_agent(config.user_agent)
232            .timeout(config.timeout);
233
234        // Apply custom headers
235        if let Some(headers) = config.headers {
236            client_builder = client_builder.default_headers(headers);
237        }
238
239        // Apply redirect policy
240        if let Some(redirect_policy) = config.redirect_policy {
241            client_builder = client_builder.redirect(redirect_policy);
242        }
243
244        let client = client_builder
245            .build()
246            .expect("Failed to create HTTP client with custom config");
247
248        Self { client }
249    }
250}
251
252// for GitHub
253impl Fetcher {
254
255    pub fn new_github_client() -> Self {
256        debug!("Creating GitHub-specific client");
257
258        let mut headers = HeaderMap::new();
259        headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());
260
261        if let Ok(token) = std::env::var("GITHUB_TOKEN") {
262            debug!("Found GitHub token in environment");
263            headers.insert(
264                "Authorization",
265                format!("Bearer {}", token).parse().unwrap(),
266            );
267        }
268
269        let client = Client::builder()
270            .user_agent("url_preview/1.0")
271            .default_headers(headers)
272            .timeout(Duration::from_secs(10))
273            .build()
274            .unwrap();
275
276        Self { client }
277    }
278
279    pub async fn fetch_github_repo(
280        &self,
281        owner: &str,
282        repo: &str,
283    ) -> Result<GitHubRepository, PreviewError> {
284        let url = format!("https://api.github.com/repos/{}/{}", owner, repo);
285        debug!(url = %url, "Fetching GitHub repository information");
286
287        let response =
288            self.client.get(&url).send().await.map_err(|e| {
289                PreviewError::FetchError(format!("GitHub API request failed: {}", e))
290            })?;
291
292        if !response.status().is_success() {
293            return Err(PreviewError::FetchError(format!(
294                "GitHub API returned status: {}",
295                response.status()
296            )));
297        }
298
299        response.json::<GitHubRepository>().await.map_err(|e| {
300            PreviewError::ExtractError(format!("Failed to parse GitHub response: {}", e))
301        })
302    }
303}
304
/// Configuration options for building a [`Fetcher`] via
/// `Fetcher::new_with_config`.
///
/// # Examples
/// ```ignore
/// let fetcher = Fetcher::new();
///
/// // Using Twitter-specific configuration
/// let twitter_fetcher = Fetcher::new_twitter_client();
///
/// // Using custom configuration
/// let custom_fetcher = Fetcher::new_with_config(FetcherConfig {
///     user_agent: "my-custom-agent/1.0".to_string(),
///     timeout: Duration::from_secs(20),
///     headers: Some(my_custom_headers),
///     redirect_policy: Some(my_redirect_policy),
/// });
/// ```
pub struct FetcherConfig {
    /// User-agent string sent with every request.
    pub user_agent: String,
    /// Per-request timeout.
    pub timeout: Duration,
    /// Extra default headers applied to every request, if any.
    pub headers: Option<HeaderMap>,
    /// Redirect-following policy; `None` keeps the client default.
    pub redirect_policy: Option<reqwest::redirect::Policy>,
}
328
329impl Default for FetcherConfig {
330    fn default() -> Self {
331        Self {
332            user_agent: "url_preview/0.1.0".to_string(),
333            timeout: Duration::from_secs(10),
334            headers: None,
335            redirect_policy: None,
336        }
337    }
338}
339
340// for GitHub
341impl Fetcher {
342    pub async fn fetch_github_basic_preview(
343        &self,
344        owner: &str,
345        repo: &str,
346    ) -> Result<GitHubBasicPreview, PreviewError> {
347        let url = format!("https://github.com/{}/{}", owner, repo);
348        debug!("Fetching basic preview for repository: {}/{}", owner, repo);
349
350        let response =
351            self.client.get(&url).send().await.map_err(|e| {
352                PreviewError::FetchError(format!("Failed to fetch GitHub page: {}", e))
353            })?;
354
355        let html = response.text().await.map_err(|e| {
356            PreviewError::FetchError(format!("Failed to read response body: {}", e))
357        })?;
358
359        let document = Html::parse_document(&html);
360
361        let title = self.extract_title(&document)?;
362        let description = self.extract_description(&document);
363        let image_url = self.extract_og_image(&document);
364
365        if let Some(ref url) = image_url {
366            debug!("Found GitHub Reop Preview Image URL: {}", url);
367        } else {
368            warn!("Not Found GitHub Reop Preview Image URL");
369        }
370
371        Ok(GitHubBasicPreview {
372            title,
373            description,
374            image_url,
375        })
376    }
377
378    pub async fn fetch_github_detailed_info(
379        &self,
380        owner: &str,
381        repo: &str,
382    ) -> Result<GitHubDetailedInfo, PreviewError> {
383        let api_url = format!("https://api.github.com/repos/{}/{}", owner, repo);
384        debug!("Fetching detailed info from GitHub API: {}", api_url);
385
386        let response = self
387            .client
388            .get(&api_url)
389            .header("Accept", "application/vnd.github.v3+json")
390            .send()
391            .await
392            .map_err(|e| PreviewError::FetchError(format!("GitHub API request failed: {}", e)))?;
393
394        let repo_data: serde_json::Value = response.json().await.map_err(|e| {
395            PreviewError::ExtractError(format!("Failed to parse GitHub API response: {}", e))
396        })?;
397
398        let contributors_url = format!("{}/contributors?per_page=1", api_url);
399        let contributors_count = self.get_contributors_count(&contributors_url).await?;
400
401        Ok(GitHubDetailedInfo {
402            stars_count: repo_data["stargazers_count"].as_u64().unwrap_or(0) as u32,
403            forks_count: repo_data["forks_count"].as_u64().unwrap_or(0) as u32,
404            contributors_count,
405            issues_count: repo_data["open_issues_count"].as_u64().unwrap_or(0) as u32,
406            discussions_count: repo_data["discussions_count"].as_u64().unwrap_or(0) as u32,
407            primary_language: repo_data["language"].as_str().map(String::from),
408        })
409    }
410
411    fn extract_title(&self, document: &Html) -> Result<String, PreviewError> {
412        let og_title_selector = Selector::parse("meta[property='og:title']")
413            .map_err(|e| PreviewError::ExtractError(format!("Invalid selector: {}", e)))?;
414
415        document
416            .select(&og_title_selector)
417            .next()
418            .and_then(|el| el.value().attr("content"))
419            .map(String::from)
420            .ok_or_else(|| PreviewError::ExtractError("Title not found".into()))
421    }
422
423    fn extract_description(&self, document: &Html) -> Option<String> {
424        let selector = Selector::parse("meta[property='og:description']").ok()?;
425        document
426            .select(&selector)
427            .next()
428            .and_then(|el| el.value().attr("content"))
429            .map(String::from)
430    }
431
432    fn extract_og_image(&self, document: &Html) -> Option<String> {
433        let twitter_image_selector = Selector::parse("meta[name='twitter:image']").ok()?;
434
435        if let Some(url) = document
436            .select(&twitter_image_selector)
437            .next()
438            .and_then(|el| el.value().attr("content"))
439        {
440            debug!("Found Twitter image URL: {}", url);
441            return Some(url.to_string());
442        }
443
444        // if not found twitter:image,back to find og:image
445        let og_image_selector = Selector::parse("meta[property='og:image']").ok()?;
446
447        document
448            .select(&og_image_selector)
449            .next()
450            .and_then(|el| el.value().attr("content"))
451            .map(|url| {
452                debug!("Found Open Graph image URL: {}", url);
453                url.to_string()
454            })
455    }
456
457    async fn get_contributors_count(&self, url: &str) -> Result<u32, PreviewError> {
458        let response = self.client.get(url).send().await.map_err(|e| {
459            PreviewError::FetchError(format!("Failed to fetch contributors: {}", e))
460        })?;
461
462        if let Some(link_header) = response.headers().get("Link") {
463            if let Ok(link_str) = link_header.to_str() {
464                if let Some(last_page) = parse_github_link_header(link_str) {
465                    return Ok(last_page);
466                }
467            }
468        }
469
470        Ok(1)
471    }
472}
473
/// Extracts the page number of the `rel="last"` entry from a GitHub `Link`
/// header, e.g.
/// `<…/contributors?page=2>; rel="next", <…/contributors?page=817>; rel="last"`.
///
/// Returns `None` when no `rel="last"` entry with a parseable `page`
/// query parameter is present.
fn parse_github_link_header(link_str: &str) -> Option<u32> {
    for link in link_str.split(',') {
        if !link.contains("rel=\"last\"") {
            continue;
        }

        // Strip the `<…>` wrapper (and stray spaces) around the URL part.
        let url = link
            .split(';')
            .next()?
            .trim_matches(|c| c == '<' || c == '>' || c == ' ');

        // Look up the `page` query parameter explicitly. The previous
        // `split('=').last()` took whatever followed the *last* `=`, which
        // returned the wrong value whenever `page` was not the final
        // parameter (e.g. `?page=817&per_page=1` yielded 1).
        let query = match url.split_once('?') {
            Some((_, query)) => query,
            None => continue,
        };
        for pair in query.split('&') {
            if let Some(value) = pair.strip_prefix("page=") {
                return value.parse().ok();
            }
        }
    }
    None
}