1use super::is_twitter_url;
2#[cfg(feature = "github")]
3use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
4use crate::PreviewError;
5use reqwest::{header::HeaderMap, Client};
6use scraper::{Html, Selector};
7use serde::Deserialize;
8use std::time::Duration;
9#[cfg(feature = "logging")]
10use tracing::{debug, error, instrument, warn};
11
12#[derive(Debug, Clone, Deserialize)]
13pub struct OEmbedResponse {
14 pub html: String,
15 #[serde(default)]
16 pub author_name: String,
17 #[serde(default)]
18 pub author_url: String,
19 pub provider_name: String,
20 pub provider_url: String,
21}
22
23#[derive(Clone)]
24pub struct Fetcher {
25 client: Client,
26}
27
28#[derive(Debug, Clone)]
29pub enum FetchResult {
30 Html(String),
31 OEmbed(OEmbedResponse),
32}
33
34impl Default for Fetcher {
35 fn default() -> Self {
36 Self::new()
37 }
38}
39
40impl Fetcher {
41 pub fn new() -> Self {
42 let user_agent = "url_preview/0.1.0";
43 let timeout = Duration::from_secs(10);
44 #[cfg(feature = "logging")]
45 debug!("Fetcher initialized with default configuration");
46
47 Self::new_with_custom_config(timeout, user_agent)
48 }
49
50 pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
51 let client = Client::builder()
52 .timeout(timeout)
53 .user_agent(user_agent)
54 .pool_max_idle_per_host(10)
55 .build()
56 .unwrap_or_else(|e| {
57 #[cfg(feature = "logging")]
58 error!(error = %e, "Failed to create HTTP client");
59 panic!("Failed to initialize HTTP client: {}", e);
60 });
61 Fetcher { client }
62 }
63
64 pub fn with_client(client: Client) -> Self {
65 Self { client }
66 }
67
68 pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
69 let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
70 let results = futures::future::join_all(futures).await;
71
72 let mut responses = Vec::new();
73 for result in results {
74 match result {
75 Ok(response) => responses.push(response),
76 Err(e) => return Err(e),
77 }
78 }
79
80 Ok(responses)
81 }
82
83 #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
84 pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
85 let max_retries = 3;
86 let mut delay = Duration::from_millis(1000);
87
88 for attempt in 0..max_retries {
89 #[cfg(feature = "logging")]
90 debug!(attempt = attempt + 1, "Attempting to fetch URL");
91
92 match self.client.get(url).send().await {
93 Ok(response) => {
94 if response.status() == 404 {
96 return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
97 }
98
99 if response.status().is_success() {
100 #[cfg(feature = "logging")]
101 debug!(url = %url, "Successfully fetched URL");
102 return response.text().await.map_err(|e| {
103 #[cfg(feature = "logging")]
104 error!(error = %e, "Failed to read response body");
105 PreviewError::FetchError(e.to_string())
106 });
107 }
108
109 if response.status().is_server_error() && attempt < max_retries - 1 {
111 #[cfg(feature = "logging")]
112 warn!(
113 status = %response.status(),
114 attempt = attempt + 1,
115 "Server error, retrying after delay"
116 );
117 tokio::time::sleep(delay).await;
118 delay *= 2;
119 continue;
120 }
121
122 let status = response.status().as_u16();
124 let message = format!("Server returned status: {}", response.status());
125 return Err(match status {
126 400..=499 => PreviewError::ClientError { status, message },
127 500..=599 => PreviewError::ServerError { status, message },
128 _ => PreviewError::HttpError { status, message },
129 });
130 }
131 Err(e) => {
132 let preview_error = PreviewError::from_reqwest_error(e);
133
134 let should_retry = matches!(
136 &preview_error,
137 PreviewError::ServerError { .. }
138 | PreviewError::TimeoutError(_)
139 | PreviewError::ConnectionError(_)
140 );
141
142 if should_retry && attempt < max_retries - 1 {
143 #[cfg(feature = "logging")]
144 warn!(
145 error = %preview_error,
146 attempt = attempt + 1,
147 "Request error, retrying after delay"
148 );
149 tokio::time::sleep(delay).await;
150 delay *= 2;
151 continue;
152 }
153 #[cfg(feature = "logging")]
154 error!(error = %preview_error, "Request failed");
155 return Err(preview_error);
156 }
157 }
158 }
159
160 #[cfg(feature = "logging")]
161 error!("Failed to fetch URL after maximum retries");
162 Err(PreviewError::FetchError("Max retries exceeded".to_string()))
163 }
164
165 #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
166 pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
167 #[cfg(feature = "logging")]
168 debug!(url = %url, "Starting fetch request");
169
170 if is_twitter_url(url) {
171 #[cfg(feature = "logging")]
172 debug!(url = %url, "Detected Twitter URL, using oEmbed API");
173 #[cfg(feature = "twitter")]
174 {
175 let oembed = self.fetch_twitter_oembed(url).await?;
176 Ok(FetchResult::OEmbed(oembed))
177 }
178 #[cfg(not(feature = "twitter"))]
179 {
180 self.fetch_html(url).await.map(FetchResult::Html)
182 }
183 } else {
184 #[cfg(feature = "logging")]
185 debug!(url = %url, "Fetching regular webpage");
186 self.fetch_html(url).await.map(FetchResult::Html)
187 }
188 }
189
190 async fn fetch_html(&self, url: &str) -> Result<String, PreviewError> {
191 let response = self.client.get(url).send().await.map_err(|e| {
192 #[cfg(feature = "logging")]
193 error!(error = %e, url = %url, "Failed to send request");
194 PreviewError::from_reqwest_error(e)
195 })?;
196
197 if response.status() == 404 {
199 return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
200 }
201
202 if !response.status().is_success() {
203 let status = response.status().as_u16();
204 let message = format!("Server returned status: {}", response.status());
205
206 return Err(match status {
207 400..=499 => PreviewError::ClientError { status, message },
208 500..=599 => PreviewError::ServerError { status, message },
209 _ => PreviewError::HttpError { status, message },
210 });
211 }
212
213 let content = response.text().await.map_err(|e| {
214 #[cfg(feature = "logging")]
215 error!(error = %e, url = %url, "Failed to read response body");
216 PreviewError::FetchError(e.to_string())
217 })?;
218
219 #[cfg(feature = "logging")]
220 debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
221 Ok(content)
222 }
223
/// Resolves `tweet_url` through the `publish.twitter.com` oEmbed endpoint.
///
/// The tweet URL is passed as a percent-encoded `url` query parameter, so
/// characters such as `&`, `?` or `#` inside it cannot leak into, or cut
/// short, the oEmbed request's own query string. `omit_script=1` and
/// `lang=en` are always sent alongside it.
///
/// # Errors
///
/// * `NotFound` when the endpoint answers 404.
/// * `ExternalServiceError` (service `"Twitter"`) for request failures, any
///   other non-success status, and responses that cannot be deserialized
///   into [`OEmbedResponse`].
#[cfg(feature = "twitter")]
#[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
    // Every failure except 404 is reported as a Twitter service error.
    let twitter_error = |message: String| PreviewError::ExternalServiceError {
        service: "Twitter".to_string(),
        message,
    };

    // Build the query with proper encoding instead of string interpolation.
    let oembed_url = reqwest::Url::parse_with_params(
        "https://publish.twitter.com/oembed",
        &[("url", tweet_url), ("omit_script", "1"), ("lang", "en")],
    )
    .map_err(|e| twitter_error(format!("Invalid oEmbed request URL: {e}")))?;

    #[cfg(feature = "logging")]
    debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");

    let response = self.client.get(oembed_url).send().await.map_err(|e| {
        #[cfg(feature = "logging")]
        error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
        match PreviewError::from_reqwest_error(e) {
            PreviewError::DnsError(msg) => twitter_error(format!("DNS error: {}", msg)),
            PreviewError::TimeoutError(msg) => twitter_error(format!("Timeout: {}", msg)),
            PreviewError::ConnectionError(msg) => {
                twitter_error(format!("Connection error: {}", msg))
            }
            other => twitter_error(other.to_string()),
        }
    })?;

    let status = response.status();

    if status == 404 {
        return Err(PreviewError::NotFound(format!(
            "Twitter/X content not found: {tweet_url}"
        )));
    }

    if !status.is_success() {
        let code = status.as_u16();
        let message = format!("Twitter API returned status: {}", status);

        return Err(twitter_error(match code {
            400..=499 => format!("Client error ({}): {}", code, message),
            500..=599 => format!("Server error ({}): {}", code, message),
            _ => format!("HTTP error ({}): {}", code, message),
        }));
    }

    let oembed: OEmbedResponse = response.json().await.map_err(|e| {
        #[cfg(feature = "logging")]
        error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
        twitter_error(e.to_string())
    })?;

    #[cfg(feature = "logging")]
    debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
    Ok(oembed)
}
295}
296
297#[cfg(feature = "twitter")]
299impl Fetcher {
300 #[cfg_attr(feature = "logging", instrument(level = "debug"))]
301 pub fn new_twitter_client() -> Self {
302 #[cfg(feature = "logging")]
303 debug!("Creating Twitter-specific fetcher");
304
305 let mut headers = HeaderMap::new();
306
307 headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
308 headers.insert(
309 "Accept",
310 "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
311 .parse()
312 .unwrap(),
313 );
314
315 headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
316 headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
317 headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
318 headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
319 headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());
320
321 headers.insert("Cache-Control", "no-cache".parse().unwrap());
322 headers.insert("Pragma", "no-cache".parse().unwrap());
323
324 let client = Client::builder()
325 .user_agent(
326 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
327 AppleWebKit/537.36 (KHTML, like Gecko) \
328 Chrome/119.0.0.0 Safari/537.36",
329 )
330 .timeout(Duration::from_secs(30))
331 .redirect(reqwest::redirect::Policy::limited(10))
332 .default_headers(headers)
333 .build()
334 .expect("Failed to create Twitter HTTP client");
335
336 #[cfg(feature = "logging")]
337 debug!("Twitter-specific fetcher created successfully");
338 Self { client }
339 }
340
341 pub fn new_with_config(config: FetcherConfig) -> Self {
344 let mut client_builder = Client::builder()
345 .user_agent(config.user_agent)
346 .timeout(config.timeout);
347
348 if let Some(headers) = config.headers {
350 client_builder = client_builder.default_headers(headers);
351 }
352
353 if let Some(redirect_policy) = config.redirect_policy {
355 client_builder = client_builder.redirect(redirect_policy);
356 }
357
358 let client = client_builder
359 .build()
360 .expect("Failed to create HTTP client with custom config");
361
362 Self { client }
363 }
364}
365
366#[cfg(feature = "github")]
368impl Fetcher {
369 pub fn new_github_client() -> Self {
370 #[cfg(feature = "logging")]
371 debug!("Creating GitHub-specific client");
372
373 let mut headers = HeaderMap::new();
374 headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());
375
376 if let Ok(token) = std::env::var("GITHUB_TOKEN") {
377 #[cfg(feature = "logging")]
378 debug!("Found GitHub token in environment");
379 headers.insert(
380 "Authorization",
381 format!("Bearer {}", token).parse().unwrap(),
382 );
383 }
384
385 let client = Client::builder()
386 .user_agent("url_preview/1.0")
387 .default_headers(headers)
388 .timeout(Duration::from_secs(10))
389 .build()
390 .expect("Failed to create GitHub HTTP client");
391
392 Self { client }
393 }
394
395 pub async fn fetch_github_repo(
396 &self,
397 owner: &str,
398 repo: &str,
399 ) -> Result<GitHubRepository, PreviewError> {
400 let url = format!("https://api.github.com/repos/{}/{}", owner, repo);
401 #[cfg(feature = "logging")]
402 debug!(url = %url, "Fetching GitHub repository information");
403
404 let response = self
405 .client
406 .get(&url)
407 .send()
408 .await
409 .map_err(PreviewError::from_reqwest_error)?;
410
411 if response.status() == 404 {
413 return Err(PreviewError::NotFound(format!(
414 "GitHub repository {owner}/{repo} not found"
415 )));
416 }
417
418 if !response.status().is_success() {
419 let status = response.status().as_u16();
420 let message = format!("API returned status: {}", response.status());
421
422 return Err(match status {
423 400..=499 => PreviewError::ClientError { status, message },
424 500..=599 => PreviewError::ServerError { status, message },
425 _ => PreviewError::HttpError { status, message },
426 });
427 }
428
429 let repo_info: GitHubRepository = response
430 .json()
431 .await
432 .map_err(|e| PreviewError::ParseError(e.to_string()))?;
433
434 Ok(repo_info)
435 }
436
/// Extracts `(owner, repo)` from a GitHub repository URL.
///
/// Accepted forms:
///
/// * `https://github.com/owner/repo` and `http://github.com/owner/repo`,
///   optionally with a `www.` host prefix;
/// * `github.com/owner/repo` (no scheme);
/// * the bare `owner/repo` shorthand.
///
/// Extra path segments (`/issues/1`), a query string and a fragment are
/// ignored. Surrounding whitespace is trimmed.
///
/// Returns `None` when the owner or repository segment is missing or empty,
/// or when the URL has an `http(s)` scheme but its host is not `github.com`.
pub fn parse_github_url(url: &str) -> Option<(String, String)> {
    let trimmed = url.trim();

    let (had_scheme, rest) = match trimmed
        .strip_prefix("https://")
        .or_else(|| trimmed.strip_prefix("http://"))
    {
        Some(rest) => (true, rest),
        None => (false, trimmed),
    };
    let rest = rest.strip_prefix("www.").unwrap_or(rest);

    let path = match rest.strip_prefix("github.com/") {
        Some(path) => path,
        // A full URL pointing at some other host is not a GitHub repository.
        None if had_scheme => return None,
        // No scheme and no host: treat the input as `owner/repo`.
        None => rest,
    };

    // Drop any query string or fragment before splitting the path.
    let path = path
        .split(|c| c == '?' || c == '#')
        .next()
        .unwrap_or(path);

    let mut segments = path.split('/');
    let owner = segments.next().filter(|s| !s.is_empty())?;
    let repo = segments.next().filter(|s| !s.is_empty())?;

    Some((owner.to_string(), repo.to_string()))
}
454
455 fn extract_og_image(html: &str) -> Option<String> {
457 let document = Html::parse_document(html);
458 let selector = Selector::parse("meta[property='og:image']").ok()?;
459
460 document
461 .select(&selector)
462 .next()
463 .and_then(|elem| elem.value().attr("content"))
464 .map(|s| s.to_string())
465 }
466
467 pub async fn fetch_github_basic_preview(
469 &self,
470 owner: &str,
471 repo: &str,
472 ) -> Result<GitHubBasicPreview, PreviewError> {
473 let url = format!("https://github.com/{}/{}", owner, repo);
474 #[cfg(feature = "logging")]
475 debug!("Fetching basic preview for repository: {}/{}", owner, repo);
476
477 let response = self
478 .client
479 .get(&url)
480 .send()
481 .await
482 .map_err(PreviewError::from_reqwest_error)?;
483
484 if response.status() == 404 {
486 return Err(PreviewError::NotFound(format!(
487 "GitHub repository {owner}/{repo} not found"
488 )));
489 }
490
491 if !response.status().is_success() {
492 return Err(PreviewError::FetchError(format!(
493 "GitHub returned status: {}",
494 response.status()
495 )));
496 }
497
498 let html = response
499 .text()
500 .await
501 .map_err(|e| PreviewError::FetchError(e.to_string()))?;
502
503 let document = Html::parse_document(&html);
504
505 let title = Self::extract_meta_content(&document, "meta[property='og:title']");
507 let description = Self::extract_meta_content(&document, "meta[property='og:description']");
508 let image_url = Self::extract_og_image(&html);
509
510 #[cfg(feature = "logging")]
511 {
512 if let Some(ref url) = image_url {
513 debug!("Found GitHub Reop Preview Image URL: {}", url);
514 } else {
515 warn!("Not Found GitHub Reop Preview Image URL");
516 }
517 }
518
519 Ok(GitHubBasicPreview {
520 title,
521 description,
522 image_url,
523 })
524 }
525
526 pub async fn fetch_github_detailed_info(
528 &self,
529 owner: &str,
530 repo: &str,
531 ) -> Result<GitHubDetailedInfo, PreviewError> {
532 let api_url = format!("https://api.github.com/repos/{}/{}", owner, repo);
533 #[cfg(feature = "logging")]
534 debug!("Fetching detailed info from GitHub API: {}", api_url);
535
536 let response = self
537 .client
538 .get(&api_url)
539 .send()
540 .await
541 .map_err(PreviewError::from_reqwest_error)?;
542
543 if response.status() == 404 {
545 return Err(PreviewError::NotFound(format!(
546 "GitHub repository {owner}/{repo} not found"
547 )));
548 }
549
550 if !response.status().is_success() {
551 let status = response.status().as_u16();
552 let message = format!("API returned status: {}", response.status());
553
554 return Err(match status {
555 400..=499 => PreviewError::ClientError { status, message },
556 500..=599 => PreviewError::ServerError { status, message },
557 _ => PreviewError::HttpError { status, message },
558 });
559 }
560
561 let data: serde_json::Value = response
562 .json()
563 .await
564 .map_err(|e| PreviewError::ParseError(e.to_string()))?;
565
566 Ok(GitHubDetailedInfo {
567 full_name: data["full_name"].as_str().unwrap_or("").to_string(),
568 description: data["description"]
569 .as_str()
570 .map(|s| s.to_string())
571 .unwrap_or_default(),
572 stars_count: data["stargazers_count"].as_u64().unwrap_or(0) as u32,
573 forks_count: data["forks_count"].as_u64().unwrap_or(0) as u32,
574 open_issues_count: data["open_issues_count"].as_u64().unwrap_or(0) as u32,
575 language: data["language"].as_str().map(|s| s.to_string()),
576 default_branch: data["default_branch"]
577 .as_str()
578 .unwrap_or("main")
579 .to_string(),
580 topics: data["topics"]
581 .as_array()
582 .map(|arr| {
583 arr.iter()
584 .filter_map(|v| v.as_str().map(|s| s.to_string()))
585 .collect()
586 })
587 .unwrap_or_default(),
588 html_url: data["html_url"].as_str().unwrap_or(&api_url).to_string(),
589 homepage: data["homepage"]
590 .as_str()
591 .filter(|s| !s.is_empty())
592 .map(|s| s.to_string()),
593 })
594 }
595
596 fn extract_meta_content(document: &Html, selector_str: &str) -> Option<String> {
597 let selector = Selector::parse(selector_str).ok()?;
598 document
599 .select(&selector)
600 .next()
601 .and_then(|elem| elem.value().attr("content"))
602 .map(|s| s.to_string())
603 }
604}
605
606impl Fetcher {
608 pub fn extract_twitter_image_from_html(html: &str) -> Option<String> {
609 let document = Html::parse_document(html);
610 let selector = Selector::parse("meta[name='twitter:image']").ok()?;
611
612 if let Some(url) = document
613 .select(&selector)
614 .next()
615 .and_then(|elem| elem.value().attr("content"))
616 {
617 #[cfg(feature = "logging")]
618 debug!("Found Twitter image URL: {}", url);
619 return Some(url.to_string());
620 }
621
622 let og_selector = Selector::parse("meta[property='og:image']").ok()?;
623 document
624 .select(&og_selector)
625 .next()
626 .and_then(|elem| elem.value().attr("content"))
627 .map(|url| {
628 #[cfg(feature = "logging")]
629 debug!("Found Open Graph image URL: {}", url);
630 url.to_string()
631 })
632 }
633}
634
635pub struct FetcherConfig {
637 pub user_agent: String,
638 pub timeout: Duration,
639 pub headers: Option<HeaderMap>,
640 pub redirect_policy: Option<reqwest::redirect::Policy>,
641}
642
643impl Default for FetcherConfig {
644 fn default() -> Self {
645 Self {
646 user_agent: "url_preview/0.1.0".to_string(),
647 timeout: Duration::from_secs(10),
648 headers: None,
649 redirect_policy: None,
650 }
651 }
652}