1use super::is_twitter_url;
2use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
3use crate::PreviewError;
4use reqwest::{header::HeaderMap, Client};
5use scraper::{Html, Selector};
6use serde::Deserialize;
7use std::time::Duration;
8use tracing::{debug, error, instrument, warn};
9
/// Subset of the oEmbed payload returned by Twitter's publish endpoint
/// (see `Fetcher::fetch_twitter_oembed`).
#[derive(Debug, Clone, Deserialize)]
pub struct OEmbedResponse {
    /// Embeddable HTML snippet for the tweet.
    pub html: String,
    /// Author display name; defaults to an empty string when the payload
    /// omits the field.
    #[serde(default)]
    pub author_name: String,
    /// Author profile URL; defaults to an empty string when absent.
    #[serde(default)]
    pub author_url: String,
    /// oEmbed provider name (required — deserialization fails if missing).
    pub provider_name: String,
    /// oEmbed provider URL (required — deserialization fails if missing).
    pub provider_url: String,
}
20
/// HTTP fetcher for preview content; wraps a configured `reqwest::Client`.
#[derive(Clone)]
pub struct Fetcher {
    // Shared HTTP client used by all fetch methods.
    client: Client,
}
25
/// Outcome of a `Fetcher::fetch` call, which dispatches by URL type.
#[derive(Debug, Clone)]
pub enum FetchResult {
    /// Raw HTML body of a regular webpage.
    Html(String),
    /// Parsed oEmbed payload (used for Twitter URLs).
    OEmbed(OEmbedResponse),
}
31
32impl Default for Fetcher {
33 fn default() -> Self {
34 Self::new()
35 }
36}
37
38impl Fetcher {
39 pub fn new() -> Self {
40 let user_agent = "url_preview/0.1.0";
41 let timeout = Duration::from_secs(10);
42 debug!("Fetcher initialized with default configuration");
43
44 Self::new_with_custom_config(timeout, user_agent)
45 }
46
47 pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
48 let client = Client::builder()
49 .timeout(timeout)
50 .user_agent(user_agent)
51 .pool_max_idle_per_host(10)
52 .build()
53 .unwrap_or_else(|e| {
54 error!(error = %e, "Failed to create HTTP client");
55 panic!("Failed to initialize HTTP client: {}", e);
56 });
57 Fetcher { client }
58 }
59
60 pub fn with_client(client: Client) -> Self {
61 Self { client }
62 }
63
64 pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
65 let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
66 let results = futures::future::join_all(futures).await;
67
68 let mut responses = Vec::new();
69 for result in results {
70 match result {
71 Ok(response) => responses.push(response),
72 Err(e) => return Err(e),
73 }
74 }
75
76 Ok(responses)
77 }
78
79 #[instrument(level = "debug", skip(self), err)]
80 pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
81 let max_retries = 3;
82 let mut delay = Duration::from_millis(1000);
83
84 for attempt in 0..max_retries {
85 debug!(attempt = attempt + 1, "Attempting to fetch URL");
86
87 match self.client.get(url).send().await {
88 Ok(response) => {
89 if response.status().is_success() {
90 debug!(url = %url, "Successfully fetched URL");
91 return response.text().await.map_err(|e| {
92 error!(error = %e, "Failed to read response body");
93 PreviewError::FetchError(e.to_string())
94 });
95 }
96
97 if attempt < max_retries - 1 {
98 warn!(
99 status = %response.status(),
100 attempt = attempt + 1,
101 "Request failed, retrying after delay"
102 );
103 tokio::time::sleep(delay).await;
104 delay *= 2;
105 continue;
106 }
107 }
108 Err(e) => {
109 if attempt < max_retries - 1 {
110 warn!(
111 error = %e,
112 attempt = attempt + 1,
113 "Request error, retrying after delay"
114 );
115 tokio::time::sleep(delay).await;
116 delay *= 2;
117 continue;
118 }
119 error!(error = %e, "Max retries exceeded");
120 return Err(PreviewError::FetchError(e.to_string()));
121 }
122 }
123 }
124
125 error!("Failed to fetch URL after maximum retries");
126 Err(PreviewError::FetchError("Max retries exceeded".to_string()))
127 }
128
129 #[instrument(level = "debug", skip(self), err)]
130 pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
131 debug!(url = %url, "Starting fetch request");
132
133 if is_twitter_url(url) {
134 debug!(url = %url, "Detected Twitter URL, using oEmbed API");
135 let oembed = self.fetch_twitter_oembed(url).await?;
136 Ok(FetchResult::OEmbed(oembed))
137 } else {
138 debug!(url = %url, "Fetching regular webpage");
139 let content = self
140 .client
141 .get(url)
142 .send()
143 .await
144 .map_err(|e| {
145 error!(error = %e, url = %url, "Failed to send request");
146 PreviewError::FetchError(e.to_string())
147 })?
148 .text()
149 .await
150 .map_err(|e| {
151 error!(error = %e, url = %url, "Failed to read response body");
152 PreviewError::FetchError(e.to_string())
153 })?;
154
155 debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
156 Ok(FetchResult::Html(content))
157 }
158 }
159
160 #[instrument(level = "debug", skip(self), err)]
161 async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
162 let oembed_url = format!(
163 "https://publish.twitter.com/oembed?url={}&omit_script=1&lang=en",
164 tweet_url
165 );
166
167 debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");
168
169 let response = self.client.get(&oembed_url).send().await.map_err(|e| {
170 error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
171 PreviewError::ExternalServiceError {
172 service: "Twitter".to_string(),
173 message: e.to_string(),
174 }
175 })?;
176
177 let oembed: OEmbedResponse = response.json().await.map_err(|e| {
178 error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
179 PreviewError::ExternalServiceError {
180 service: "Twitter".to_string(),
181 message: e.to_string(),
182 }
183 })?;
184
185 debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
186 Ok(oembed)
187 }
188}
189
190impl Fetcher {
192 #[instrument(level = "debug")]
193 pub fn new_twitter_client() -> Self {
194 debug!("Creating Twitter-specific fetcher");
195
196 let mut headers = HeaderMap::new();
197
198 headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
199 headers.insert(
200 "Accept",
201 "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
202 .parse()
203 .unwrap(),
204 );
205
206 headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
207 headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
208 headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
209 headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
210 headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());
211
212 headers.insert("Cache-Control", "no-cache".parse().unwrap());
213 headers.insert("Pragma", "no-cache".parse().unwrap());
214
215 let client = Client::builder()
216 .user_agent(
217 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
218 AppleWebKit/537.36 (KHTML, like Gecko) \
219 Chrome/119.0.0.0 Safari/537.36",
220 )
221 .timeout(Duration::from_secs(30))
222 .redirect(reqwest::redirect::Policy::limited(10))
223 .default_headers(headers)
224 .build()
225 .expect("Failed to create Twitter HTTP client");
226
227 debug!("Twitter-specific fetcher created successfully");
228 Self { client }
229 }
230
231 pub fn new_with_config(config: FetcherConfig) -> Self {
234 let mut client_builder = Client::builder()
235 .user_agent(config.user_agent)
236 .timeout(config.timeout);
237
238 if let Some(headers) = config.headers {
240 client_builder = client_builder.default_headers(headers);
241 }
242
243 if let Some(redirect_policy) = config.redirect_policy {
245 client_builder = client_builder.redirect(redirect_policy);
246 }
247
248 let client = client_builder
249 .build()
250 .expect("Failed to create HTTP client with custom config");
251
252 Self { client }
253 }
254}
255
256impl Fetcher {
258 pub fn new_github_client() -> Self {
259 debug!("Creating GitHub-specific client");
260
261 let mut headers = HeaderMap::new();
262 headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());
263
264 if let Ok(token) = std::env::var("GITHUB_TOKEN") {
265 debug!("Found GitHub token in environment");
266 headers.insert(
267 "Authorization",
268 format!("Bearer {}", token).parse().unwrap(),
269 );
270 }
271
272 let client = Client::builder()
273 .user_agent("url_preview/1.0")
274 .default_headers(headers)
275 .timeout(Duration::from_secs(10))
276 .build()
277 .unwrap();
278
279 Self { client }
280 }
281
282 pub async fn fetch_github_repo(
283 &self,
284 owner: &str,
285 repo: &str,
286 ) -> Result<GitHubRepository, PreviewError> {
287 let url = format!("https://api.github.com/repos/{}/{}", owner, repo);
288 debug!(url = %url, "Fetching GitHub repository information");
289
290 let response =
291 self.client.get(&url).send().await.map_err(|e| {
292 PreviewError::FetchError(format!("GitHub API request failed: {}", e))
293 })?;
294
295 if !response.status().is_success() {
296 return Err(PreviewError::FetchError(format!(
297 "GitHub API returned status: {}",
298 response.status()
299 )));
300 }
301
302 response.json::<GitHubRepository>().await.map_err(|e| {
303 PreviewError::ExtractError(format!("Failed to parse GitHub response: {}", e))
304 })
305 }
306}
307
/// Options for building a [`Fetcher`] via `Fetcher::new_with_config`.
pub struct FetcherConfig {
    /// Value sent in the `User-Agent` header.
    pub user_agent: String,
    /// Total request timeout applied to the client.
    pub timeout: Duration,
    /// Extra default headers; `None` leaves the client defaults untouched.
    pub headers: Option<HeaderMap>,
    /// Redirect policy override; `None` keeps reqwest's default policy.
    pub redirect_policy: Option<reqwest::redirect::Policy>,
}
331
332impl Default for FetcherConfig {
333 fn default() -> Self {
334 Self {
335 user_agent: "url_preview/0.1.0".to_string(),
336 timeout: Duration::from_secs(10),
337 headers: None,
338 redirect_policy: None,
339 }
340 }
341}
342
343impl Fetcher {
345 pub async fn fetch_github_basic_preview(
346 &self,
347 owner: &str,
348 repo: &str,
349 ) -> Result<GitHubBasicPreview, PreviewError> {
350 let url = format!("https://github.com/{}/{}", owner, repo);
351 debug!("Fetching basic preview for repository: {}/{}", owner, repo);
352
353 let response =
354 self.client.get(&url).send().await.map_err(|e| {
355 PreviewError::FetchError(format!("Failed to fetch GitHub page: {}", e))
356 })?;
357
358 let html = response.text().await.map_err(|e| {
359 PreviewError::FetchError(format!("Failed to read response body: {}", e))
360 })?;
361
362 let document = Html::parse_document(&html);
363
364 let title = self.extract_title(&document)?;
365 let description = self.extract_description(&document);
366 let image_url = self.extract_og_image(&document);
367
368 if let Some(ref url) = image_url {
369 debug!("Found GitHub Reop Preview Image URL: {}", url);
370 } else {
371 warn!("Not Found GitHub Reop Preview Image URL");
372 }
373
374 Ok(GitHubBasicPreview {
375 title,
376 description,
377 image_url,
378 })
379 }
380
381 pub async fn fetch_github_detailed_info(
382 &self,
383 owner: &str,
384 repo: &str,
385 ) -> Result<GitHubDetailedInfo, PreviewError> {
386 let api_url = format!("https://api.github.com/repos/{}/{}", owner, repo);
387 debug!("Fetching detailed info from GitHub API: {}", api_url);
388
389 let response = self
390 .client
391 .get(&api_url)
392 .header("Accept", "application/vnd.github.v3+json")
393 .send()
394 .await
395 .map_err(|e| PreviewError::FetchError(format!("GitHub API request failed: {}", e)))?;
396
397 let repo_data: serde_json::Value = response.json().await.map_err(|e| {
398 PreviewError::ExtractError(format!("Failed to parse GitHub API response: {}", e))
399 })?;
400
401 let contributors_url = format!("{}/contributors?per_page=1", api_url);
402 let contributors_count = self.get_contributors_count(&contributors_url).await?;
403
404 Ok(GitHubDetailedInfo {
405 stars_count: repo_data["stargazers_count"].as_u64().unwrap_or(0) as u32,
406 forks_count: repo_data["forks_count"].as_u64().unwrap_or(0) as u32,
407 contributors_count,
408 issues_count: repo_data["open_issues_count"].as_u64().unwrap_or(0) as u32,
409 discussions_count: repo_data["discussions_count"].as_u64().unwrap_or(0) as u32,
410 primary_language: repo_data["language"].as_str().map(String::from),
411 })
412 }
413
414 fn extract_title(&self, document: &Html) -> Result<String, PreviewError> {
415 let og_title_selector = Selector::parse("meta[property='og:title']")
416 .map_err(|e| PreviewError::ExtractError(format!("Invalid selector: {}", e)))?;
417
418 document
419 .select(&og_title_selector)
420 .next()
421 .and_then(|el| el.value().attr("content"))
422 .map(String::from)
423 .ok_or_else(|| PreviewError::ExtractError("Title not found".into()))
424 }
425
426 fn extract_description(&self, document: &Html) -> Option<String> {
427 let selector = Selector::parse("meta[property='og:description']").ok()?;
428 document
429 .select(&selector)
430 .next()
431 .and_then(|el| el.value().attr("content"))
432 .map(String::from)
433 }
434
435 fn extract_og_image(&self, document: &Html) -> Option<String> {
436 let twitter_image_selector = Selector::parse("meta[name='twitter:image']").ok()?;
437
438 if let Some(url) = document
439 .select(&twitter_image_selector)
440 .next()
441 .and_then(|el| el.value().attr("content"))
442 {
443 debug!("Found Twitter image URL: {}", url);
444 return Some(url.to_string());
445 }
446
447 let og_image_selector = Selector::parse("meta[property='og:image']").ok()?;
449
450 document
451 .select(&og_image_selector)
452 .next()
453 .and_then(|el| el.value().attr("content"))
454 .map(|url| {
455 debug!("Found Open Graph image URL: {}", url);
456 url.to_string()
457 })
458 }
459
460 async fn get_contributors_count(&self, url: &str) -> Result<u32, PreviewError> {
461 let response = self.client.get(url).send().await.map_err(|e| {
462 PreviewError::FetchError(format!("Failed to fetch contributors: {}", e))
463 })?;
464
465 if let Some(link_header) = response.headers().get("Link") {
466 if let Ok(link_str) = link_header.to_str() {
467 if let Some(last_page) = parse_github_link_header(link_str) {
468 return Ok(last_page);
469 }
470 }
471 }
472
473 Ok(1)
474 }
475}
476
/// Extracts the page number of the `rel="last"` entry from a GitHub
/// `Link` response header, e.g.
/// `<https://api.github.com/...?per_page=1&page=5>; rel="last"` → `Some(5)`.
///
/// Returns `None` when no `rel="last"` link with a parseable `page`
/// query parameter is present.
fn parse_github_link_header(link_str: &str) -> Option<u32> {
    for link in link_str.split(',') {
        // Only the rel="last" entry carries the total page count.
        if !link.contains("rel=\"last\"") {
            continue;
        }

        // The URL is the first `;`-separated segment, wrapped in `<...>`.
        let url = link
            .split(';')
            .next()
            .map(|part| part.trim().trim_matches(|c| c == '<' || c == '>'))
            .unwrap_or("");

        // Look up the `page` query parameter explicitly. The previous
        // `split('=').last()` grabbed the text after the LAST `=` in the
        // URL, which returns the wrong value whenever `page` is not the
        // final parameter (e.g. `?page=5&per_page=1` yielded 1).
        let query = url.splitn(2, '?').nth(1).unwrap_or("");
        for pair in query.split('&') {
            if let Some(value) = pair.strip_prefix("page=") {
                if let Ok(page) = value.parse() {
                    return Some(page);
                }
            }
        }
    }
    None
}