1use super::is_twitter_url;
2use crate::github_types::{GitHubBasicPreview, GitHubDetailedInfo, GitHubRepository};
3use crate::PreviewError;
4use reqwest::{header::HeaderMap, Client};
5use scraper::{Html, Selector};
6use serde::Deserialize;
7use std::time::Duration;
8use tracing::{debug, error, instrument, warn};
9
/// Subset of the Twitter/X oEmbed API response that this crate consumes.
///
/// Deserialized from `https://publish.twitter.com/oembed` (see
/// `Fetcher::fetch_twitter_oembed`). `provider_name`/`provider_url` are
/// required; deserialization fails if the payload omits them.
#[derive(Debug, Clone, Deserialize)]
pub struct OEmbedResponse {
    /// Embeddable HTML snippet for the tweet.
    pub html: String,
    /// Tweet author's display name; empty string when absent from the payload.
    #[serde(default)]
    pub author_name: String,
    /// URL of the author's profile; empty string when absent from the payload.
    #[serde(default)]
    pub author_url: String,
    /// Name of the oEmbed provider (e.g. "Twitter").
    pub provider_name: String,
    /// Home URL of the oEmbed provider.
    pub provider_url: String,
}
20
/// HTTP fetcher wrapping a `reqwest::Client`, with specialized entry points
/// for regular pages, Twitter oEmbed data, and GitHub metadata.
#[derive(Clone)]
pub struct Fetcher {
    // NOTE: per reqwest's docs, `Client` is internally pooled and cheap to
    // clone, so deriving `Clone` here shares one connection pool.
    client: Client,
}
25
/// Outcome of a `Fetcher::fetch` call: either the raw page body, or a
/// structured oEmbed payload (used for Twitter URLs).
#[derive(Debug, Clone)]
pub enum FetchResult {
    /// Raw HTML body of a regular web page.
    Html(String),
    /// Parsed response from Twitter's oEmbed endpoint.
    OEmbed(OEmbedResponse),
}
31
32impl Fetcher {
33 pub fn new() -> Self {
34 let user_agent = "url_preview/0.1.0";
35 let timeout = Duration::from_secs(10);
36 debug!("Fetcher initialized with default configuration");
37
38 Self::new_with_custom_config(timeout, user_agent)
39 }
40
41 pub fn new_with_custom_config(timeout: Duration, user_agent: &str) -> Self {
42 let client = Client::builder()
43 .timeout(timeout)
44 .user_agent(user_agent)
45 .pool_max_idle_per_host(10)
46 .build()
47 .unwrap_or_else(|e| {
48 error!(error = %e, "Failed to create HTTP client");
49 panic!("Failed to initialize HTTP client: {}", e);
50 });
51 Fetcher { client }
52 }
53
54 pub fn with_client(client: Client) -> Self {
55 Self { client }
56 }
57
58 pub async fn fetch_batch(&self, urls: Vec<&str>) -> Result<Vec<FetchResult>, PreviewError> {
59 let futures: Vec<_> = urls.into_iter().map(|url| self.fetch(url)).collect();
60 let results = futures::future::join_all(futures).await;
61
62 let mut responses = Vec::new();
63 for result in results {
64 match result {
65 Ok(response) => responses.push(response),
66 Err(e) => return Err(e),
67 }
68 }
69
70 Ok(responses)
71 }
72
73
74 #[instrument(level = "debug", skip(self), err)]
75 pub async fn fetch_with_backoff(&self, url: &str) -> Result<String, PreviewError> {
76 let max_retries = 3;
77 let mut delay = Duration::from_millis(1000);
78
79 for attempt in 0..max_retries {
80 debug!(attempt = attempt + 1, "Attempting to fetch URL");
81
82 match self.client.get(url).send().await {
83 Ok(response) => {
84 if response.status().is_success() {
85 debug!(url = %url, "Successfully fetched URL");
86 return response.text().await.map_err(|e| {
87 error!(error = %e, "Failed to read response body");
88 PreviewError::FetchError(e.to_string())
89 });
90 }
91
92 if attempt < max_retries - 1 {
93 warn!(
94 status = %response.status(),
95 attempt = attempt + 1,
96 "Request failed, retrying after delay"
97 );
98 tokio::time::sleep(delay).await;
99 delay *= 2;
100 continue;
101 }
102 }
103 Err(e) => {
104 if attempt < max_retries - 1 {
105 warn!(
106 error = %e,
107 attempt = attempt + 1,
108 "Request error, retrying after delay"
109 );
110 tokio::time::sleep(delay).await;
111 delay *= 2;
112 continue;
113 }
114 error!(error = %e, "Max retries exceeded");
115 return Err(PreviewError::FetchError(e.to_string()));
116 }
117 }
118 }
119
120 error!("Failed to fetch URL after maximum retries");
121 Err(PreviewError::FetchError("Max retries exceeded".to_string()))
122 }
123
124 #[instrument(level = "debug", skip(self), err)]
125 pub async fn fetch(&self, url: &str) -> Result<FetchResult, PreviewError> {
126 debug!(url = %url, "Starting fetch request");
127
128 if is_twitter_url(url) {
129 debug!(url = %url, "Detected Twitter URL, using oEmbed API");
130 let oembed = self.fetch_twitter_oembed(url).await?;
131 Ok(FetchResult::OEmbed(oembed))
132 } else {
133 debug!(url = %url, "Fetching regular webpage");
134 let content = self
135 .client
136 .get(url)
137 .send()
138 .await
139 .map_err(|e| {
140 error!(error = %e, url = %url, "Failed to send request");
141 PreviewError::FetchError(e.to_string())
142 })?
143 .text()
144 .await
145 .map_err(|e| {
146 error!(error = %e, url = %url, "Failed to read response body");
147 PreviewError::FetchError(e.to_string())
148 })?;
149
150 debug!(url = %url, content_length = content.len(), "Successfully fetched webpage");
151 Ok(FetchResult::Html(content))
152 }
153 }
154
155 #[instrument(level = "debug", skip(self), err)]
156 async fn fetch_twitter_oembed(&self, tweet_url: &str) -> Result<OEmbedResponse, PreviewError> {
157 let oembed_url = format!(
158 "https://publish.twitter.com/oembed?url={}&omit_script=1&lang=en",
159 tweet_url
160 );
161
162 debug!(tweet_url = %tweet_url, "Fetching Twitter oEmbed data");
163
164 let response = self.client.get(&oembed_url).send().await.map_err(|e| {
165 error!(error = %e, url = %tweet_url, "Failed to fetch Twitter oEmbed");
166 PreviewError::ExternalServiceError {
167 service: "Twitter".to_string(),
168 message: e.to_string(),
169 }
170 })?;
171
172 let oembed: OEmbedResponse = response.json().await.map_err(|e| {
173 error!(error = %e, url = %tweet_url, "Failed to parse Twitter oEmbed response");
174 PreviewError::ExternalServiceError {
175 service: "Twitter".to_string(),
176 message: e.to_string(),
177 }
178 })?;
179
180 debug!(tweet_url = %tweet_url, "Successfully fetched Twitter oEmbed data");
181 Ok(oembed)
182 }
183}
184
185impl Fetcher {
187
188 #[instrument(level = "debug")]
189 pub fn new_twitter_client() -> Self {
190 debug!("Creating Twitter-specific fetcher");
191
192 let mut headers = HeaderMap::new();
193
194 headers.insert("Accept-Language", "en-US,en;q=0.9".parse().unwrap());
195 headers.insert(
196 "Accept",
197 "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
198 .parse()
199 .unwrap(),
200 );
201
202 headers.insert("Sec-Fetch-Dest", "document".parse().unwrap());
203 headers.insert("Sec-Fetch-Mode", "navigate".parse().unwrap());
204 headers.insert("Sec-Fetch-Site", "none".parse().unwrap());
205 headers.insert("Sec-Fetch-User", "?1".parse().unwrap());
206 headers.insert("Upgrade-Insecure-Requests", "1".parse().unwrap());
207
208 headers.insert("Cache-Control", "no-cache".parse().unwrap());
209 headers.insert("Pragma", "no-cache".parse().unwrap());
210
211 let client = Client::builder()
212 .user_agent(
213 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
214 AppleWebKit/537.36 (KHTML, like Gecko) \
215 Chrome/119.0.0.0 Safari/537.36",
216 )
217 .timeout(Duration::from_secs(30))
218 .redirect(reqwest::redirect::Policy::limited(10))
219 .default_headers(headers)
220 .build()
221 .expect("Failed to create Twitter HTTP client");
222
223 debug!("Twitter-specific fetcher created successfully");
224 Self { client }
225 }
226
227 pub fn new_with_config(config: FetcherConfig) -> Self {
230 let mut client_builder = Client::builder()
231 .user_agent(config.user_agent)
232 .timeout(config.timeout);
233
234 if let Some(headers) = config.headers {
236 client_builder = client_builder.default_headers(headers);
237 }
238
239 if let Some(redirect_policy) = config.redirect_policy {
241 client_builder = client_builder.redirect(redirect_policy);
242 }
243
244 let client = client_builder
245 .build()
246 .expect("Failed to create HTTP client with custom config");
247
248 Self { client }
249 }
250}
251
252impl Fetcher {
254
255 pub fn new_github_client() -> Self {
256 debug!("Creating GitHub-specific client");
257
258 let mut headers = HeaderMap::new();
259 headers.insert("Accept", "application/vnd.github.v3+json".parse().unwrap());
260
261 if let Ok(token) = std::env::var("GITHUB_TOKEN") {
262 debug!("Found GitHub token in environment");
263 headers.insert(
264 "Authorization",
265 format!("Bearer {}", token).parse().unwrap(),
266 );
267 }
268
269 let client = Client::builder()
270 .user_agent("url_preview/1.0")
271 .default_headers(headers)
272 .timeout(Duration::from_secs(10))
273 .build()
274 .unwrap();
275
276 Self { client }
277 }
278
279 pub async fn fetch_github_repo(
280 &self,
281 owner: &str,
282 repo: &str,
283 ) -> Result<GitHubRepository, PreviewError> {
284 let url = format!("https://api.github.com/repos/{}/{}", owner, repo);
285 debug!(url = %url, "Fetching GitHub repository information");
286
287 let response =
288 self.client.get(&url).send().await.map_err(|e| {
289 PreviewError::FetchError(format!("GitHub API request failed: {}", e))
290 })?;
291
292 if !response.status().is_success() {
293 return Err(PreviewError::FetchError(format!(
294 "GitHub API returned status: {}",
295 response.status()
296 )));
297 }
298
299 response.json::<GitHubRepository>().await.map_err(|e| {
300 PreviewError::ExtractError(format!("Failed to parse GitHub response: {}", e))
301 })
302 }
303}
304
/// Configuration for building a custom `Fetcher` via `Fetcher::new_with_config`.
pub struct FetcherConfig {
    /// User-agent string sent with every request.
    pub user_agent: String,
    /// Per-request timeout.
    pub timeout: Duration,
    /// Optional default headers; `None` leaves the client's defaults untouched.
    pub headers: Option<HeaderMap>,
    /// Optional redirect policy; `None` keeps reqwest's default behavior.
    pub redirect_policy: Option<reqwest::redirect::Policy>,
}
328
329impl Default for FetcherConfig {
330 fn default() -> Self {
331 Self {
332 user_agent: "url_preview/0.1.0".to_string(),
333 timeout: Duration::from_secs(10),
334 headers: None,
335 redirect_policy: None,
336 }
337 }
338}
339
340impl Fetcher {
342 pub async fn fetch_github_basic_preview(
343 &self,
344 owner: &str,
345 repo: &str,
346 ) -> Result<GitHubBasicPreview, PreviewError> {
347 let url = format!("https://github.com/{}/{}", owner, repo);
348 debug!("Fetching basic preview for repository: {}/{}", owner, repo);
349
350 let response =
351 self.client.get(&url).send().await.map_err(|e| {
352 PreviewError::FetchError(format!("Failed to fetch GitHub page: {}", e))
353 })?;
354
355 let html = response.text().await.map_err(|e| {
356 PreviewError::FetchError(format!("Failed to read response body: {}", e))
357 })?;
358
359 let document = Html::parse_document(&html);
360
361 let title = self.extract_title(&document)?;
362 let description = self.extract_description(&document);
363 let image_url = self.extract_og_image(&document);
364
365 if let Some(ref url) = image_url {
366 debug!("Found GitHub Reop Preview Image URL: {}", url);
367 } else {
368 warn!("Not Found GitHub Reop Preview Image URL");
369 }
370
371 Ok(GitHubBasicPreview {
372 title,
373 description,
374 image_url,
375 })
376 }
377
378 pub async fn fetch_github_detailed_info(
379 &self,
380 owner: &str,
381 repo: &str,
382 ) -> Result<GitHubDetailedInfo, PreviewError> {
383 let api_url = format!("https://api.github.com/repos/{}/{}", owner, repo);
384 debug!("Fetching detailed info from GitHub API: {}", api_url);
385
386 let response = self
387 .client
388 .get(&api_url)
389 .header("Accept", "application/vnd.github.v3+json")
390 .send()
391 .await
392 .map_err(|e| PreviewError::FetchError(format!("GitHub API request failed: {}", e)))?;
393
394 let repo_data: serde_json::Value = response.json().await.map_err(|e| {
395 PreviewError::ExtractError(format!("Failed to parse GitHub API response: {}", e))
396 })?;
397
398 let contributors_url = format!("{}/contributors?per_page=1", api_url);
399 let contributors_count = self.get_contributors_count(&contributors_url).await?;
400
401 Ok(GitHubDetailedInfo {
402 stars_count: repo_data["stargazers_count"].as_u64().unwrap_or(0) as u32,
403 forks_count: repo_data["forks_count"].as_u64().unwrap_or(0) as u32,
404 contributors_count,
405 issues_count: repo_data["open_issues_count"].as_u64().unwrap_or(0) as u32,
406 discussions_count: repo_data["discussions_count"].as_u64().unwrap_or(0) as u32,
407 primary_language: repo_data["language"].as_str().map(String::from),
408 })
409 }
410
411 fn extract_title(&self, document: &Html) -> Result<String, PreviewError> {
412 let og_title_selector = Selector::parse("meta[property='og:title']")
413 .map_err(|e| PreviewError::ExtractError(format!("Invalid selector: {}", e)))?;
414
415 document
416 .select(&og_title_selector)
417 .next()
418 .and_then(|el| el.value().attr("content"))
419 .map(String::from)
420 .ok_or_else(|| PreviewError::ExtractError("Title not found".into()))
421 }
422
423 fn extract_description(&self, document: &Html) -> Option<String> {
424 let selector = Selector::parse("meta[property='og:description']").ok()?;
425 document
426 .select(&selector)
427 .next()
428 .and_then(|el| el.value().attr("content"))
429 .map(String::from)
430 }
431
432 fn extract_og_image(&self, document: &Html) -> Option<String> {
433 let twitter_image_selector = Selector::parse("meta[name='twitter:image']").ok()?;
434
435 if let Some(url) = document
436 .select(&twitter_image_selector)
437 .next()
438 .and_then(|el| el.value().attr("content"))
439 {
440 debug!("Found Twitter image URL: {}", url);
441 return Some(url.to_string());
442 }
443
444 let og_image_selector = Selector::parse("meta[property='og:image']").ok()?;
446
447 document
448 .select(&og_image_selector)
449 .next()
450 .and_then(|el| el.value().attr("content"))
451 .map(|url| {
452 debug!("Found Open Graph image URL: {}", url);
453 url.to_string()
454 })
455 }
456
457 async fn get_contributors_count(&self, url: &str) -> Result<u32, PreviewError> {
458 let response = self.client.get(url).send().await.map_err(|e| {
459 PreviewError::FetchError(format!("Failed to fetch contributors: {}", e))
460 })?;
461
462 if let Some(link_header) = response.headers().get("Link") {
463 if let Ok(link_str) = link_header.to_str() {
464 if let Some(last_page) = parse_github_link_header(link_str) {
465 return Ok(last_page);
466 }
467 }
468 }
469
470 Ok(1)
471 }
472}
473
/// Parses a GitHub pagination `Link` header and returns the page number of
/// the `rel="last"` entry, if any.
///
/// Example input:
/// `<https://api.github.com/repos/o/r/contributors?per_page=1&page=42>; rel="last"`
fn parse_github_link_header(link_str: &str) -> Option<u32> {
    for link in link_str.split(',') {
        if !link.contains("rel=\"last\"") {
            continue;
        }

        // The URL part precedes the first ';'; strip the angle brackets and
        // surrounding whitespace.
        let url = link
            .split(';')
            .next()
            .unwrap_or("")
            .trim_matches(|c| c == '<' || c == '>' || c == ' ');

        // Look up the `page` query parameter explicitly. The previous
        // implementation took the value after the *last* '=', which returned
        // the wrong number whenever `page` was not the final query parameter
        // (e.g. `?page=42&per_page=1` parsed as 1).
        if let Some(query) = url.split('?').nth(1) {
            if let Some(page) = query
                .split('&')
                .find_map(|pair| pair.strip_prefix("page="))
                .and_then(|value| value.parse().ok())
            {
                return Some(page);
            }
        }
    }
    None
}