1use super::is_twitter_url;
2#[cfg(feature = "github")]
3use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
4use crate::PreviewError;
5use reqwest::{header::HeaderMap, Client};
6use scraper::{Html, Selector};
7use serde::Deserialize;
8use std::time::Duration;
9#[cfg(feature = "logging")]
10use tracing::{debug, error, instrument, warn};
11
12#[derive(Debug, Clone, Deserialize)]
13pub struct OEmbedResponse {
14 pub html: String,
15 #[serde(default)]
16 pub author_name: String,
17 #[serde(default)]
18 pub author_url: String,
19 pub provider_name: String,
20 pub provider_url: String,
21}
22
23#[derive(Clone)]
24pub struct Fetcher {
25 client: Client,
26}
27
28#[derive(Debug, Clone)]
29pub enum FetchResult {
30 Html(String),
31 OEmbed(OEmbedResponse),
32}
33
34impl Default for Fetcher {
35 fn default() -> Self {
36 Self::new()
37 }
38}
39
40impl Fetcher {
41 pub fn new() -> Self {
42 let user_agent = "url_preview/0.1.0";
43 let timeout = Duration::from_secs(10);
44 #[cfg(feature = "logging")]
45 debug!("Fetcher initialized with default configuration");
46
47 Self::new_with_custom_config(timeout, user_agent)
48 }
49
50 pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
51 let client = Client::builder()
52 .timeout(timeout)
53 .user_agent(user_agent)
54 .pool_max_idle_per_host(10)
55 .build()
56 .unwrap_or_else(|e| {
57 #[cfg(feature = "logging")]
58 error!(error = %e, "Failed to create HTTP client");
59 panic!("Failed to initialize HTTP client: {e}");
60 });
61 Fetcher { client }
62 }
63
64 pub fn with_client(client: Client) -> Self {
65 Self { client }
66 }
67
68 pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
69 let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
70 let results = futures::future::join_all(futures).await;
71
72 let mut responses = Vec::new();
73 for result in results {
74 match result {
75 Ok(response) => responses.push(response),
76 Err(e) => return Err(e),
77 }
78 }
79
80 Ok(responses)
81 }
82
83 #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
84 pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
85 let max_retries = 3;
86 let mut delay = Duration::from_millis(1000);
87
88 for attempt in 0..max_retries {
89 #[cfg(feature = "logging")]
90 debug!(attempt = attempt + 1, "Attempting to fetch URL");
91
92 match self.client.get(url).send().await {
93 Ok(response) => {
94 if response.status() == 404 {
96 return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
97 }
98
99 if response.status().is_success() {
100 #[cfg(feature = "logging")]
101 debug!(url = %url, "Successfully fetched URL");
102 return response.text().await.map_err(|e| {
103 #[cfg(feature = "logging")]
104 error!(error = %e, "Failed to read response body");
105 PreviewError::FetchError(e.to_string())
106 });
107 }
108
109 if response.status().is_server_error() && attempt < max_retries - 1 {
111 #[cfg(feature = "logging")]
112 warn!(
113 status = %response.status(),
114 attempt = attempt + 1,
115 "Server error, retrying after delay"
116 );
117 tokio::time::sleep(delay).await;
118 delay *= 2;
119 continue;
120 }
121
122 let status = response.status().as_u16();
124 let message = format!("Server returned status: {}", response.status());
125 return Err(match status {
126 400..=499 => PreviewError::ClientError { status, message },
127 500..=599 => PreviewError::ServerError { status, message },
128 _ => PreviewError::HttpError { status, message },
129 });
130 }
131 Err(e) => {
132 let preview_error = PreviewError::from_reqwest_error(e);
133
134 let should_retry = matches!(
136 &preview_error,
137 PreviewError::ServerError { .. }
138 | PreviewError::TimeoutError(_)
139 | PreviewError::ConnectionError(_)
140 );
141
142 if should_retry && attempt < max_retries - 1 {
143 #[cfg(feature = "logging")]
144 warn!(
145 error = %preview_error,
146 attempt = attempt + 1,
147 "Request error, retrying after delay"
148 );
149 tokio::time::sleep(delay).await;
150 delay *= 2;
151 continue;
152 }
153 #[cfg(feature = "logging")]
154 error!(error = %preview_error, "Request failed");
155 return Err(preview_error);
156 }
157 }
158 }
159
160 #[cfg(feature = "logging")]
161 error!("Failed to fetch URL after maximum retries");
162 Err(PreviewError::FetchError("Max retries exceeded".to_string()))
163 }
164
165 #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
166 pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
167 #[cfg(feature = "logging")]
168 debug!(url = %url, "Starting fetch request");
169
170 if is_twitter_url(url) {
171 #[cfg(feature = "logging")]
172 debug!(url = %url, "Detected Twitter URL, using oEmbed API");
173 #[cfg(feature = "twitter")]
174 {
175 let oembed = self.fetch_twitter_oembed(url).await?;
176 Ok(FetchResult::OEmbed(oembed))
177 }
178 #[cfg(not(feature = "twitter"))]
179 {
180 self.fetch_html(url).await.map(FetchResult::Html)
182 }
183 } else {
184 #[cfg(feature = "logging")]
185 debug!(url = %url, "Fetching regular webpage");
186 self.fetch_html(url).await.map(FetchResult::Html)
187 }
188 }
189
190 async fn fetch_html(&self, url: &str) -> Result<String, PreviewError> {
191 let response = self.client.get(url).send().await.map_err(|e| {
192 #[cfg(feature = "logging")]
193 error!(error = %e, url = %url, "Failed to send request");
194 PreviewError::from_reqwest_error(e)
195 })?;
196
197 if response.status() == 404 {
199 return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
200 }
201
202 if !response.status().is_success() {
203 let status = response.status().as_u16();
204 let message = format!("Server returned status: {}", response.status());
205
206 return Err(match status {
207 400..=499 => PreviewError::ClientError { status, message },
208 500..=599 => PreviewError::ServerError { status, message },
209 _ => PreviewError::HttpError { status, message },
210 });
211 }
212
213 let content = response.text().await.map_err(|e| {
214 #[cfg(feature = "logging")]
215 error!(error = %e, url = %url, "Failed to read response body");
216 PreviewError::FetchError(e.to_string())
217 })?;
218
219 #[cfg(feature = "logging")]
220 debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
221 Ok(content)
222 }
223
/// Resolves a tweet URL through the `publish.twitter.com` oEmbed endpoint.
///
/// The tweet URL is passed as a percent-encoded `url` query parameter, so
/// characters such as `?`, `&` or `#` inside `tweet_url` (for example the
/// `?s=20&t=…` suffix of shared links) cannot be mistaken for parameters of
/// the oEmbed request itself.
///
/// # Errors
///
/// * `PreviewError::NotFound` when the endpoint answers 404.
/// * `PreviewError::ExternalServiceError` (service `"Twitter"`) when the
///   request URL cannot be built, the request fails, the endpoint returns
///   any other non-success status, or the body is not a valid oEmbed JSON
///   document.
#[cfg(feature = "twitter")]
#[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
    // Every failure except 404 is reported as a Twitter service error.
    let twitter_error = |message: String| PreviewError::ExternalServiceError {
        service: "Twitter".to_string(),
        message,
    };

    // Build the query with proper encoding instead of string interpolation.
    let oembed_url = reqwest::Url::parse_with_params(
        "https://publish.twitter.com/oembed",
        &[("url", tweet_url), ("omit_script", "1"), ("lang", "en")],
    )
    .map_err(|e| twitter_error(format!("Failed to build oEmbed URL: {e}")))?;

    #[cfg(feature = "logging")]
    debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");

    let response = self.client.get(oembed_url).send().await.map_err(|e| {
        #[cfg(feature = "logging")]
        error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
        let message = match PreviewError::from_reqwest_error(e) {
            PreviewError::DnsError(msg) => format!("DNS error: {msg}"),
            PreviewError::TimeoutError(msg) => format!("Timeout: {msg}"),
            PreviewError::ConnectionError(msg) => format!("Connection error: {msg}"),
            other => other.to_string(),
        };
        twitter_error(message)
    })?;

    let status = response.status();

    if status == 404 {
        return Err(PreviewError::NotFound(format!(
            "Twitter/X content not found: {tweet_url}"
        )));
    }

    if !status.is_success() {
        let code = status.as_u16();
        let detail = format!("Twitter API returned status: {status}");
        let kind = match code {
            400..=499 => "Client error",
            500..=599 => "Server error",
            _ => "HTTP error",
        };
        return Err(twitter_error(format!("{kind} ({code}): {detail}")));
    }

    let oembed = response.json::<OEmbedResponse>().await.map_err(|e| {
        #[cfg(feature = "logging")]
        error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
        twitter_error(e.to_string())
    })?;

    #[cfg(feature = "logging")]
    debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
    Ok(oembed)
}
294}
295
296#[cfg(feature = "twitter")]
298impl Fetcher {
299 #[cfg_attr(feature = "logging", instrument(level = "debug"))]
300 pub fn new_twitter_client() -> Self {
301 #[cfg(feature = "logging")]
302 debug!("Creating Twitter-specific fetcher");
303
304 let mut headers = HeaderMap::new();
305
306 headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
307 headers.insert(
308 "Accept",
309 "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
310 .parse()
311 .unwrap(),
312 );
313
314 headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
315 headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
316 headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
317 headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
318 headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());
319
320 headers.insert("Cache-Control", "no-cache".parse().unwrap());
321 headers.insert("Pragma", "no-cache".parse().unwrap());
322
323 let client = Client::builder()
324 .user_agent(
325 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
326 AppleWebKit/537.36 (KHTML, like Gecko) \
327 Chrome/119.0.0.0 Safari/537.36",
328 )
329 .timeout(Duration::from_secs(30))
330 .redirect(reqwest::redirect::Policy::limited(10))
331 .default_headers(headers)
332 .build()
333 .expect("Failed to create Twitter HTTP client");
334
335 #[cfg(feature = "logging")]
336 debug!("Twitter-specific fetcher created successfully");
337 Self { client }
338 }
339
340 pub fn new_with_config(config: FetcherConfig) -> Self {
343 let mut client_builder = Client::builder()
344 .user_agent(config.user_agent)
345 .timeout(config.timeout);
346
347 if let Some(headers) = config.headers {
349 client_builder = client_builder.default_headers(headers);
350 }
351
352 if let Some(redirect_policy) = config.redirect_policy {
354 client_builder = client_builder.redirect(redirect_policy);
355 }
356
357 let client = client_builder
358 .build()
359 .expect("Failed to create HTTP client with custom config");
360
361 Self { client }
362 }
363}
364
365#[cfg(feature = "github")]
367impl Fetcher {
368 pub fn new_github_client() -> Self {
369 #[cfg(feature = "logging")]
370 debug!("Creating GitHub-specific client");
371
372 let mut headers = HeaderMap::new();
373 headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());
374
375 if let Ok(token) = std::env::var("GITHUB_TOKEN") {
376 #[cfg(feature = "logging")]
377 debug!("Found GitHub token in environment");
378 headers.insert(
379 "Authorization",
380 format!("Bearer {token}").parse().unwrap(),
381 );
382 }
383
384 let client = Client::builder()
385 .user_agent("url_preview/1.0")
386 .default_headers(headers)
387 .timeout(Duration::from_secs(10))
388 .build()
389 .expect("Failed to create GitHub HTTP client");
390
391 Self { client }
392 }
393
394 pub async fn fetch_github_repo(
395 &self,
396 owner: &str,
397 repo: &str,
398 ) -> Result<GitHubRepository, PreviewError> {
399 let url = format!("https://api.github.com/repos/{owner}/{repo}");
400 #[cfg(feature = "logging")]
401 debug!(url = %url, "Fetching GitHub repository information");
402
403 let response = self
404 .client
405 .get(&url)
406 .send()
407 .await
408 .map_err(PreviewError::from_reqwest_error)?;
409
410 if response.status() == 404 {
412 return Err(PreviewError::NotFound(format!(
413 "GitHub repository {owner}/{repo} not found"
414 )));
415 }
416
417 if !response.status().is_success() {
418 let status = response.status().as_u16();
419 let message = format!("API returned status: {}", response.status());
420
421 return Err(match status {
422 400..=499 => PreviewError::ClientError { status, message },
423 500..=599 => PreviewError::ServerError { status, message },
424 _ => PreviewError::HttpError { status, message },
425 });
426 }
427
428 let repo_info: GitHubRepository = response
429 .json()
430 .await
431 .map_err(|e| PreviewError::ParseError(e.to_string()))?;
432
433 Ok(repo_info)
434 }
435
/// Extracts the `(owner, repo)` pair from a GitHub repository URL.
///
/// Accepted inputs:
/// * `https://github.com/owner/repo` and `http://github.com/owner/repo`,
///   optionally with a `www.` prefix on the host;
/// * `github.com/owner/repo` (no scheme);
/// * a bare `owner/repo` path.
///
/// Anything after the repository segment (`/issues/1`, `?tab=readme`,
/// `#readme`) is ignored, and a trailing `.git` is removed from the
/// repository name.
///
/// Returns `None` when
/// * the input has an `http(s)://` scheme but the host is not `github.com`,
/// * the owner or repository segment is missing or empty.
pub fn parse_github_url(url: &str) -> Option<(String, String)> {
    let trimmed = url.trim();

    let (has_scheme, rest) = match trimmed
        .strip_prefix("https://")
        .or_else(|| trimmed.strip_prefix("http://"))
    {
        Some(rest) => (true, rest),
        None => (false, trimmed),
    };
    let rest = rest.strip_prefix("www.").unwrap_or(rest);

    let path = match rest.strip_prefix("github.com/") {
        Some(path) => path,
        // A full URL that points at some other host is not a GitHub URL.
        None if has_scheme => return None,
        // No scheme and no host: treat the input as a bare `owner/repo`.
        None => rest,
    };

    // Drop the query string and fragment before splitting into segments.
    let path = path
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or("");

    let mut segments = path.split('/');
    let owner = segments.next().filter(|segment| !segment.is_empty())?;
    let repo = segments.next()?;
    // Clone URLs end in `.git`, which is not part of the repository name.
    let repo = repo.strip_suffix(".git").unwrap_or(repo);
    if repo.is_empty() {
        return None;
    }

    Some((owner.to_string(), repo.to_string()))
}
453
454 fn extract_og_image(html: &str) -> Option<String> {
456 let document = Html::parse_document(html);
457 let selector = Selector::parse("meta[property='og:image']").ok()?;
458
459 document
460 .select(&selector)
461 .next()
462 .and_then(|elem| elem.value().attr("content"))
463 .map(|s| s.to_string())
464 }
465
466 pub async fn fetch_github_basic_preview(
468 &self,
469 owner: &str,
470 repo: &str,
471 ) -> Result<GitHubBasicPreview, PreviewError> {
472 let url = format!("https://github.com/{owner}/{repo}");
473 #[cfg(feature = "logging")]
474 debug!("Fetching basic preview for repository: {}/{}", owner, repo);
475
476 let response = self
477 .client
478 .get(&url)
479 .send()
480 .await
481 .map_err(PreviewError::from_reqwest_error)?;
482
483 if response.status() == 404 {
485 return Err(PreviewError::NotFound(format!(
486 "GitHub repository {owner}/{repo} not found"
487 )));
488 }
489
490 if !response.status().is_success() {
491 return Err(PreviewError::FetchError(format!(
492 "GitHub returned status: {}",
493 response.status()
494 )));
495 }
496
497 let html = response
498 .text()
499 .await
500 .map_err(|e| PreviewError::FetchError(e.to_string()))?;
501
502 let document = Html::parse_document(&html);
503
504 let title = Self::extract_meta_content(&document, "meta[property='og:title']");
506 let description = Self::extract_meta_content(&document, "meta[property='og:description']");
507 let image_url = Self::extract_og_image(&html);
508
509 #[cfg(feature = "logging")]
510 {
511 if let Some(ref url) = image_url {
512 debug!("Found GitHub Reop Preview Image URL: {}", url);
513 } else {
514 warn!("Not Found GitHub Reop Preview Image URL");
515 }
516 }
517
518 Ok(GitHubBasicPreview {
519 title,
520 description,
521 image_url,
522 })
523 }
524
525 pub async fn fetch_github_detailed_info(
527 &self,
528 owner: &str,
529 repo: &str,
530 ) -> Result<GitHubDetailedInfo, PreviewError> {
531 let api_url = format!("https://api.github.com/repos/{owner}/{repo}");
532 #[cfg(feature = "logging")]
533 debug!("Fetching detailed info from GitHub API: {}", api_url);
534
535 let response = self
536 .client
537 .get(&api_url)
538 .send()
539 .await
540 .map_err(PreviewError::from_reqwest_error)?;
541
542 if response.status() == 404 {
544 return Err(PreviewError::NotFound(format!(
545 "GitHub repository {owner}/{repo} not found"
546 )));
547 }
548
549 if !response.status().is_success() {
550 let status = response.status().as_u16();
551 let message = format!("API returned status: {}", response.status());
552
553 return Err(match status {
554 400..=499 => PreviewError::ClientError { status, message },
555 500..=599 => PreviewError::ServerError { status, message },
556 _ => PreviewError::HttpError { status, message },
557 });
558 }
559
560 let data: serde_json::Value = response
561 .json()
562 .await
563 .map_err(|e| PreviewError::ParseError(e.to_string()))?;
564
565 Ok(GitHubDetailedInfo {
566 full_name: data["full_name"].as_str().unwrap_or("").to_string(),
567 description: data["description"]
568 .as_str()
569 .map(|s| s.to_string())
570 .unwrap_or_default(),
571 stars_count: data["stargazers_count"].as_u64().unwrap_or(0) as u32,
572 forks_count: data["forks_count"].as_u64().unwrap_or(0) as u32,
573 open_issues_count: data["open_issues_count"].as_u64().unwrap_or(0) as u32,
574 language: data["language"].as_str().map(|s| s.to_string()),
575 default_branch: data["default_branch"]
576 .as_str()
577 .unwrap_or("main")
578 .to_string(),
579 topics: data["topics"]
580 .as_array()
581 .map(|arr| {
582 arr.iter()
583 .filter_map(|v| v.as_str().map(|s| s.to_string()))
584 .collect()
585 })
586 .unwrap_or_default(),
587 html_url: data["html_url"].as_str().unwrap_or(&api_url).to_string(),
588 homepage: data["homepage"]
589 .as_str()
590 .filter(|s| !s.is_empty())
591 .map(|s| s.to_string()),
592 })
593 }
594
595 fn extract_meta_content(document: &Html, selector_str: &str) -> Option<String> {
596 let selector = Selector::parse(selector_str).ok()?;
597 document
598 .select(&selector)
599 .next()
600 .and_then(|elem| elem.value().attr("content"))
601 .map(|s| s.to_string())
602 }
603}
604
605impl Fetcher {
607 pub fn extract_twitter_image_from_html(html: &str) -> Option<String> {
608 let document = Html::parse_document(html);
609 let selector = Selector::parse("meta[name='twitter:image']").ok()?;
610
611 if let Some(url) = document
612 .select(&selector)
613 .next()
614 .and_then(|elem| elem.value().attr("content"))
615 {
616 #[cfg(feature = "logging")]
617 debug!("Found Twitter image URL: {}", url);
618 return Some(url.to_string());
619 }
620
621 let og_selector = Selector::parse("meta[property='og:image']").ok()?;
622 document
623 .select(&og_selector)
624 .next()
625 .and_then(|elem| elem.value().attr("content"))
626 .map(|url| {
627 #[cfg(feature = "logging")]
628 debug!("Found Open Graph image URL: {}", url);
629 url.to_string()
630 })
631 }
632}
633
634pub struct FetcherConfig {
636 pub user_agent: String,
637 pub timeout: Duration,
638 pub headers: Option<HeaderMap>,
639 pub redirect_policy: Option<reqwest::redirect::Policy>,
640}
641
642impl Default for FetcherConfig {
643 fn default() -> Self {
644 Self {
645 user_agent: "url_preview/0.1.0".to_string(),
646 timeout: Duration::from_secs(10),
647 headers: None,
648 redirect_policy: None,
649 }
650 }
651}