1use super::is_twitter_url;
2#[cfg(feature = "github")]
3use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
4use crate::{ContentLimits, PreviewError, UrlValidationConfig, UrlValidator};
5#[cfg(any(feature = "twitter", feature = "github"))]
6use reqwest::header::HeaderMap;
7use reqwest::{Client, Response};
8use scraper::{Html, Selector};
9use serde::Deserialize;
10use std::time::{Duration, Instant};
11use tokio::time::timeout;
12#[cfg(feature = "logging")]
13use tracing::{debug, error, instrument, warn};
14
/// Subset of an oEmbed JSON response, deserialized with `serde`.
///
/// `html`, `provider_name` and `provider_url` must be present in the payload;
/// the author fields fall back to an empty string when they are missing.
#[derive(Debug, Clone, Deserialize)]
pub struct OEmbedResponse {
    /// Embeddable HTML snippet supplied by the provider.
    pub html: String,
    /// Author display name; empty when the payload omits it.
    #[serde(default)]
    pub author_name: String,
    /// Author profile URL; empty when the payload omits it.
    #[serde(default)]
    pub author_url: String,
    /// Name of the oEmbed provider.
    pub provider_name: String,
    /// URL of the oEmbed provider.
    pub provider_url: String,
}
25
/// HTTP fetcher that bundles a `reqwest` client with URL validation and
/// download limits.
#[derive(Clone)]
pub struct Fetcher {
    /// HTTP client used for every outgoing request.
    client: Client,
    /// Validator applied to the URL handed to `fetch` before any request is made.
    url_validator: UrlValidator,
    /// Size, time and content-type limits enforced while downloading HTML.
    content_limits: ContentLimits,
}
32
/// Outcome of a successful `Fetcher::fetch` call.
#[derive(Debug, Clone)]
pub enum FetchResult {
    /// Raw HTML body of the fetched page.
    Html(String),
    /// oEmbed payload, produced for Twitter URLs when the `twitter` feature is enabled.
    OEmbed(OEmbedResponse),
}
38
/// Settings consumed by `Fetcher::with_config`.
#[derive(Debug, Clone)]
pub struct FetcherConfig {
    /// Request timeout handed to the HTTP client builder.
    pub timeout: Duration,
    /// Value sent in the `User-Agent` header.
    pub user_agent: String,
    /// Configuration used to build the fetcher's `UrlValidator`.
    pub url_validation: UrlValidationConfig,
    /// Download limits stored on the fetcher.
    pub content_limits: ContentLimits,
}
46
47impl Default for FetcherConfig {
48 fn default() -> Self {
49 Self {
50 timeout: Duration::from_secs(10),
51 user_agent: "url_preview/0.1.0".to_string(),
52 url_validation: UrlValidationConfig::default(),
53 content_limits: ContentLimits::default(),
54 }
55 }
56}
57
58impl Default for Fetcher {
59 fn default() -> Self {
60 Self::new()
61 }
62}
63
64impl Fetcher {
65 pub fn new() -> Self {
66 Self::with_config(FetcherConfig::default())
67 }
68
69 pub fn with_config(config: FetcherConfig) -> Self {
70 let client = Client::builder()
71 .timeout(config.timeout)
72 .user_agent(&config.user_agent)
73 .pool_max_idle_per_host(10)
74 .build()
75 .unwrap_or_else(|e| {
76 #[cfg(feature = "logging")]
77 error!(error = %e, "Failed to create HTTP client");
78 panic!("Failed to initialize HTTP client: {e}");
79 });
80
81 #[cfg(feature = "logging")]
82 debug!("Fetcher initialized with custom configuration");
83
84 Fetcher {
85 client,
86 url_validator: UrlValidator::new(config.url_validation),
87 content_limits: config.content_limits,
88 }
89 }
90
91 pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
92 let config = FetcherConfig {
93 timeout,
94 user_agent: user_agent.to_string(),
95 ..Default::default()
96 };
97 Self::with_config(config)
98 }
99
100 pub fn with_client(client: Client) -> Self {
101 Self {
102 client,
103 url_validator: UrlValidator::with_default_config(),
104 content_limits: ContentLimits::default(),
105 }
106 }
107
108 pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
109 let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
110 let results = futures::future::join_all(futures).await;
111
112 let mut responses = Vec::new();
113 for result in results {
114 match result {
115 Ok(response) => responses.push(response),
116 Err(e) => return Err(e),
117 }
118 }
119
120 Ok(responses)
121 }
122
123 #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
124 pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
125 let max_retries = 3;
126 let mut delay = Duration::from_millis(1000);
127
128 for attempt in 0..max_retries {
129 #[cfg(feature = "logging")]
130 debug!(attempt = attempt + 1, "Attempting to fetch URL");
131
132 match self.client.get(url).send().await {
133 Ok(response) => {
134 if response.status() == 404 {
136 return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
137 }
138
139 if response.status().is_success() {
140 #[cfg(feature = "logging")]
141 debug!(url = %url, "Successfully fetched URL");
142 return response.text().await.map_err(|e| {
143 #[cfg(feature = "logging")]
144 error!(error = %e, "Failed to read response body");
145 PreviewError::FetchError(e.to_string())
146 });
147 }
148
149 if response.status().is_server_error() && attempt < max_retries - 1 {
151 #[cfg(feature = "logging")]
152 warn!(
153 status = %response.status(),
154 attempt = attempt + 1,
155 "Server error, retrying after delay"
156 );
157 tokio::time::sleep(delay).await;
158 delay *= 2;
159 continue;
160 }
161
162 let status = response.status().as_u16();
164 let message = format!("Server returned status: {}", response.status());
165 return Err(match status {
166 400..=499 => PreviewError::ClientError { status, message },
167 500..=599 => PreviewError::ServerError { status, message },
168 _ => PreviewError::HttpError { status, message },
169 });
170 }
171 Err(e) => {
172 let preview_error = PreviewError::from_reqwest_error(e);
173
174 let should_retry = matches!(
176 &preview_error,
177 PreviewError::ServerError { .. }
178 | PreviewError::TimeoutError(_)
179 | PreviewError::ConnectionError(_)
180 );
181
182 if should_retry && attempt < max_retries - 1 {
183 #[cfg(feature = "logging")]
184 warn!(
185 error = %preview_error,
186 attempt = attempt + 1,
187 "Request error, retrying after delay"
188 );
189 tokio::time::sleep(delay).await;
190 delay *= 2;
191 continue;
192 }
193 #[cfg(feature = "logging")]
194 error!(error = %preview_error, "Request failed");
195 return Err(preview_error);
196 }
197 }
198 }
199
200 #[cfg(feature = "logging")]
201 error!("Failed to fetch URL after maximum retries");
202 Err(PreviewError::FetchError("Max retries exceeded".to_string()))
203 }
204
205 #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
206 pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
207 let validated_url = self.url_validator.validate(url)?;
209 let url_str = validated_url.as_str();
210
211 #[cfg(feature = "logging")]
212 debug!(url = %url_str, "Starting fetch request after validation");
213
214 if is_twitter_url(url_str) {
215 #[cfg(feature = "logging")]
216 debug!(url = %url, "Detected Twitter URL, using oEmbed API");
217 #[cfg(feature = "twitter")]
218 {
219 let oembed = self.fetch_twitter_oembed(url_str).await?;
220 Ok(FetchResult::OEmbed(oembed))
221 }
222 #[cfg(not(feature = "twitter"))]
223 {
224 self.fetch_html(url_str).await.map(FetchResult::Html)
226 }
227 } else {
228 #[cfg(feature = "logging")]
229 debug!(url = %url, "Fetching regular webpage");
230 self.fetch_html(url).await.map(FetchResult::Html)
231 }
232 }
233
    /// Downloads `url` as HTML by delegating to `fetch_html_with_limits`.
    async fn fetch_html(&self, url: &str) -> Result<String, PreviewError> {
        self.fetch_html_with_limits(url).await
    }
237
238 async fn fetch_html_with_limits(&self, url: &str) -> Result<String, PreviewError> {
239 let start_time = Instant::now();
240 let download_timeout = Duration::from_secs(self.content_limits.max_download_time);
241
242 let response = timeout(download_timeout, self.client.get(url).send())
244 .await
245 .map_err(|_| PreviewError::DownloadTimeExceeded {
246 elapsed: start_time.elapsed().as_secs(),
247 limit: self.content_limits.max_download_time,
248 })?
249 .map_err(|e| {
250 #[cfg(feature = "logging")]
251 error!(error = %e, url = %url, "Failed to send request");
252 PreviewError::from_reqwest_error(e)
253 })?;
254
255 if response.status() == 404 {
257 return Err(PreviewError::NotFound(format!("Resource not found: {url}")));
258 }
259
260 if !response.status().is_success() {
261 let status = response.status().as_u16();
262 let message = format!("Server returned status: {}", response.status());
263
264 return Err(match status {
265 400..=499 => PreviewError::ClientError { status, message },
266 500..=599 => PreviewError::ServerError { status, message },
267 _ => PreviewError::HttpError { status, message },
268 });
269 }
270
271 if !self.content_limits.allowed_content_types.is_empty() {
273 if let Some(content_type) = response.headers().get("content-type") {
274 if let Ok(content_type_str) = content_type.to_str() {
275 let base_type = content_type_str.split(';').next().unwrap_or("").trim();
276 if !self
277 .content_limits
278 .allowed_content_types
279 .contains(base_type)
280 {
281 return Err(PreviewError::ContentTypeNotAllowed(base_type.to_string()));
282 }
283 }
284 }
285 }
286
287 if let Some(content_length) = response.headers().get("content-length") {
289 if let Ok(length_str) = content_length.to_str() {
290 if let Ok(length) = length_str.parse::<usize>() {
291 if length > self.content_limits.max_content_size {
292 return Err(PreviewError::ContentSizeExceeded {
293 size: length,
294 limit: self.content_limits.max_content_size,
295 });
296 }
297 }
298 }
299 }
300
301 let content = self.read_response_with_limit(response, start_time).await?;
303
304 #[cfg(feature = "logging")]
305 debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
306 Ok(content)
307 }
308
309 async fn read_response_with_limit(
310 &self,
311 response: Response,
312 start_time: Instant,
313 ) -> Result<String, PreviewError> {
314 let max_size = self.content_limits.max_content_size;
315 let max_time = Duration::from_secs(self.content_limits.max_download_time);
316
317 let bytes = tokio::time::timeout(
319 max_time.saturating_sub(start_time.elapsed()),
320 response.bytes(),
321 )
322 .await
323 .map_err(|_| PreviewError::DownloadTimeExceeded {
324 elapsed: start_time.elapsed().as_secs(),
325 limit: self.content_limits.max_download_time,
326 })?
327 .map_err(|e| {
328 #[cfg(feature = "logging")]
329 error!(error = %e, "Failed to read response body");
330 PreviewError::FetchError(e.to_string())
331 })?;
332
333 if bytes.len() > max_size {
335 return Err(PreviewError::ContentSizeExceeded {
336 size: bytes.len(),
337 limit: max_size,
338 });
339 }
340
341 String::from_utf8(bytes.to_vec()).map_err(|_e| {
343 #[cfg(feature = "logging")]
344 error!(error = %_e, "Response is not valid UTF-8");
345 PreviewError::FetchError("Invalid UTF-8 in response".to_string())
346 })
347 }
348
349 #[cfg(feature = "twitter")]
350 #[cfg_attr(feature = "logging", instrument(level = "debug", skip(self), err))]
351 async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
352 let oembed_url =
353 format!("https://publish.twitter.com/oembed?url={tweet_url}&omit_script=1&lang=en");
354
355 #[cfg(feature = "logging")]
356 debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");
357
358 let response = self.client.get(&oembed_url).send().await.map_err(|e| {
359 #[cfg(feature = "logging")]
360 error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
361 let inner_error = PreviewError::from_reqwest_error(e);
363 match inner_error {
364 PreviewError::DnsError(msg) => PreviewError::ExternalServiceError {
365 service: "Twitter".to_string(),
366 message: format!("DNS error: {msg}"),
367 },
368 PreviewError::TimeoutError(msg) => PreviewError::ExternalServiceError {
369 service: "Twitter".to_string(),
370 message: format!("Timeout: {msg}"),
371 },
372 PreviewError::ConnectionError(msg) => PreviewError::ExternalServiceError {
373 service: "Twitter".to_string(),
374 message: format!("Connection error: {msg}"),
375 },
376 _ => PreviewError::ExternalServiceError {
377 service: "Twitter".to_string(),
378 message: inner_error.to_string(),
379 },
380 }
381 })?;
382
383 if response.status() == 404 {
385 return Err(PreviewError::NotFound(format!(
386 "Twitter/X content not found: {tweet_url}"
387 )));
388 }
389
390 if !response.status().is_success() {
391 let status = response.status().as_u16();
392 let message = format!("Twitter API returned status: {}", response.status());
393
394 return Err(PreviewError::ExternalServiceError {
396 service: "Twitter".to_string(),
397 message: match status {
398 400..=499 => format!("Client error ({status}): {message}"),
399 500..=599 => format!("Server error ({status}): {message}"),
400 _ => format!("HTTP error ({status}): {message}"),
401 },
402 });
403 }
404
405 let oembed: OEmbedResponse = response.json().await.map_err(|e| {
406 #[cfg(feature = "logging")]
407 error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
408 PreviewError::ExternalServiceError {
409 service: "Twitter".to_string(),
410 message: e.to_string(),
411 }
412 })?;
413
414 #[cfg(feature = "logging")]
415 debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
416 Ok(oembed)
417 }
418}
419
420#[cfg(feature = "twitter")]
422impl Fetcher {
423 #[cfg_attr(feature = "logging", instrument(level = "debug"))]
424 pub fn new_twitter_client() -> Self {
425 #[cfg(feature = "logging")]
426 debug!("Creating Twitter-specific fetcher");
427
428 let mut headers = HeaderMap::new();
429
430 headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
431 headers.insert(
432 "Accept",
433 "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
434 .parse()
435 .unwrap(),
436 );
437
438 headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
439 headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
440 headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
441 headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
442 headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());
443
444 headers.insert("Cache-Control", "no-cache".parse().unwrap());
445 headers.insert("Pragma", "no-cache".parse().unwrap());
446
447 let client = Client::builder()
448 .user_agent(
449 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
450 AppleWebKit/537.36 (KHTML, like Gecko) \
451 Chrome/119.0.0.0 Safari/537.36",
452 )
453 .timeout(Duration::from_secs(30))
454 .redirect(reqwest::redirect::Policy::limited(10))
455 .default_headers(headers)
456 .build()
457 .expect("Failed to create Twitter HTTP client");
458
459 #[cfg(feature = "logging")]
460 debug!("Twitter-specific fetcher created successfully");
461 Self {
462 client,
463 url_validator: UrlValidator::with_default_config(),
464 content_limits: ContentLimits::default(),
465 }
466 }
467
    /// Creates a fetcher from `config` by delegating to `with_config`.
    ///
    /// Only compiled when the `twitter` feature is enabled.
    pub fn new_with_config(config: FetcherConfig) -> Self {
        Self::with_config(config)
    }
473}
474
475#[cfg(feature = "github")]
477impl Fetcher {
478 pub fn new_github_client() -> Self {
479 #[cfg(feature = "logging")]
480 debug!("Creating GitHub-specific client");
481
482 let mut headers = HeaderMap::new();
483 headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());
484
485 if let Ok(token) = std::env::var("GITHUB_TOKEN") {
486 #[cfg(feature = "logging")]
487 debug!("Found GitHub token in environment");
488 headers.insert("Authorization", format!("Bearer {token}").parse().unwrap());
489 }
490
491 let client = Client::builder()
492 .user_agent("url_preview/1.0")
493 .default_headers(headers)
494 .timeout(Duration::from_secs(10))
495 .build()
496 .expect("Failed to create GitHub HTTP client");
497
498 Self {
499 client,
500 url_validator: UrlValidator::with_default_config(),
501 content_limits: ContentLimits::default(),
502 }
503 }
504
505 pub async fn fetch_github_repo(
506 &self,
507 owner: &str,
508 repo: &str,
509 ) -> Result<GitHubRepository, PreviewError> {
510 let url = format!("https://api.github.com/repos/{owner}/{repo}");
511 #[cfg(feature = "logging")]
512 debug!(url = %url, "Fetching GitHub repository information");
513
514 let response = self
515 .client
516 .get(&url)
517 .send()
518 .await
519 .map_err(PreviewError::from_reqwest_error)?;
520
521 if response.status() == 404 {
523 return Err(PreviewError::NotFound(format!(
524 "GitHub repository {owner}/{repo} not found"
525 )));
526 }
527
528 if !response.status().is_success() {
529 let status = response.status().as_u16();
530 let message = format!("API returned status: {}", response.status());
531
532 return Err(match status {
533 400..=499 => PreviewError::ClientError { status, message },
534 500..=599 => PreviewError::ServerError { status, message },
535 _ => PreviewError::HttpError { status, message },
536 });
537 }
538
539 let repo_info: GitHubRepository = response
540 .json()
541 .await
542 .map_err(|e| PreviewError::ParseError(e.to_string()))?;
543
544 Ok(repo_info)
545 }
546
547 pub fn parse_github_url(url: &str) -> Option<(String, String)> {
552 let parts: Vec<&str> = url
553 .trim_start_matches("https://")
554 .trim_start_matches("github.com/")
555 .split('/')
556 .collect();
557
558 if parts.len() >= 2 {
559 return Some((parts[0].to_string(), parts[1].to_string()));
560 }
561
562 None
563 }
564
565 fn extract_og_image(html: &str) -> Option<String> {
567 let document = Html::parse_document(html);
568 let selector = Selector::parse("meta[property='og:image']").ok()?;
569
570 document
571 .select(&selector)
572 .next()
573 .and_then(|elem| elem.value().attr("content"))
574 .map(|s| s.to_string())
575 }
576
577 pub async fn fetch_github_basic_preview(
579 &self,
580 owner: &str,
581 repo: &str,
582 ) -> Result<GitHubBasicPreview, PreviewError> {
583 let url = format!("https://github.com/{owner}/{repo}");
584 #[cfg(feature = "logging")]
585 debug!("Fetching basic preview for repository: {}/{}", owner, repo);
586
587 let response = self
588 .client
589 .get(&url)
590 .send()
591 .await
592 .map_err(PreviewError::from_reqwest_error)?;
593
594 if response.status() == 404 {
596 return Err(PreviewError::NotFound(format!(
597 "GitHub repository {owner}/{repo} not found"
598 )));
599 }
600
601 if !response.status().is_success() {
602 return Err(PreviewError::FetchError(format!(
603 "GitHub returned status: {}",
604 response.status()
605 )));
606 }
607
608 let html = response
609 .text()
610 .await
611 .map_err(|e| PreviewError::FetchError(e.to_string()))?;
612
613 let document = Html::parse_document(&html);
614
615 let title = Self::extract_meta_content(&document, "meta[property='og:title']");
617 let description = Self::extract_meta_content(&document, "meta[property='og:description']");
618 let image_url = Self::extract_og_image(&html);
619
620 #[cfg(feature = "logging")]
621 {
622 if let Some(ref url) = image_url {
623 debug!("Found GitHub Reop Preview Image URL: {}", url);
624 } else {
625 warn!("Not Found GitHub Reop Preview Image URL");
626 }
627 }
628
629 Ok(GitHubBasicPreview {
630 title,
631 description,
632 image_url,
633 })
634 }
635
636 pub async fn fetch_github_detailed_info(
638 &self,
639 owner: &str,
640 repo: &str,
641 ) -> Result<GitHubDetailedInfo, PreviewError> {
642 let api_url = format!("https://api.github.com/repos/{owner}/{repo}");
643 #[cfg(feature = "logging")]
644 debug!("Fetching detailed info from GitHub API: {}", api_url);
645
646 let response = self
647 .client
648 .get(&api_url)
649 .send()
650 .await
651 .map_err(PreviewError::from_reqwest_error)?;
652
653 if response.status() == 404 {
655 return Err(PreviewError::NotFound(format!(
656 "GitHub repository {owner}/{repo} not found"
657 )));
658 }
659
660 if !response.status().is_success() {
661 let status = response.status().as_u16();
662 let message = format!("API returned status: {}", response.status());
663
664 return Err(match status {
665 400..=499 => PreviewError::ClientError { status, message },
666 500..=599 => PreviewError::ServerError { status, message },
667 _ => PreviewError::HttpError { status, message },
668 });
669 }
670
671 let data: serde_json::Value = response
672 .json()
673 .await
674 .map_err(|e| PreviewError::ParseError(e.to_string()))?;
675
676 Ok(GitHubDetailedInfo {
677 full_name: data["full_name"].as_str().unwrap_or("").to_string(),
678 description: data["description"]
679 .as_str()
680 .map(|s| s.to_string())
681 .unwrap_or_default(),
682 stars_count: data["stargazers_count"].as_u64().unwrap_or(0) as u32,
683 forks_count: data["forks_count"].as_u64().unwrap_or(0) as u32,
684 open_issues_count: data["open_issues_count"].as_u64().unwrap_or(0) as u32,
685 language: data["language"].as_str().map(|s| s.to_string()),
686 default_branch: data["default_branch"]
687 .as_str()
688 .unwrap_or("main")
689 .to_string(),
690 topics: data["topics"]
691 .as_array()
692 .map(|arr| {
693 arr.iter()
694 .filter_map(|v| v.as_str().map(|s| s.to_string()))
695 .collect()
696 })
697 .unwrap_or_default(),
698 html_url: data["html_url"].as_str().unwrap_or(&api_url).to_string(),
699 homepage: data["homepage"]
700 .as_str()
701 .filter(|s| !s.is_empty())
702 .map(|s| s.to_string()),
703 })
704 }
705
706 fn extract_meta_content(document: &Html, selector_str: &str) -> Option<String> {
707 let selector = Selector::parse(selector_str).ok()?;
708 document
709 .select(&selector)
710 .next()
711 .and_then(|elem| elem.value().attr("content"))
712 .map(|s| s.to_string())
713 }
714}
715
716impl Fetcher {
718 pub fn extract_twitter_image_from_html(html: &str) -> Option<String> {
719 let document = Html::parse_document(html);
720 let selector = Selector::parse("meta[name='twitter:image']").ok()?;
721
722 if let Some(url) = document
723 .select(&selector)
724 .next()
725 .and_then(|elem| elem.value().attr("content"))
726 {
727 #[cfg(feature = "logging")]
728 debug!("Found Twitter image URL: {}", url);
729 return Some(url.to_string());
730 }
731
732 let og_selector = Selector::parse("meta[property='og:image']").ok()?;
733 document
734 .select(&og_selector)
735 .next()
736 .and_then(|elem| elem.value().attr("content"))
737 .map(|url| {
738 #[cfg(feature = "logging")]
739 debug!("Found Open Graph image URL: {}", url);
740 url.to_string()
741 })
742 }
743}