1use crate::http_regex::URL_REGEX;
2use crate::{http_config::HttpConfig, user_agent::UserAgent};
3use futures::future;
4use reqwest::{Client, blocking};
5use std::future::Future;
6use std::time::Duration;
7
/// Helper for extracting URLs from text and fetching the content behind them.
///
/// The struct has no fields, so it carries no state of its own; the HTTP
/// configuration is passed to each fetch method as an argument.
#[derive(Default)]
pub struct HttpClient {}
14
15impl HttpClient {
16 pub fn new() -> Self {
18 Self {}
19 }
20
21 pub fn fetch_content_from_text(
32 &self,
33 text: &str,
34 http_config: HttpConfig,
35 ) -> Vec<(String, String)> {
36 let urls = self.extract_urls(text);
37 if urls.is_empty() {
38 return Vec::new();
39 }
40 self.fetch_content_from_urls(urls, http_config)
41 }
42
43 pub async fn fetch_content_from_text_async<F, Fut>(
44 &self,
45 text: &str,
46 http_config: HttpConfig,
47 future: F,
48 ) -> Result<(), Box<dyn std::error::Error>>
49 where
50 F: Fn(Option<String>, Option<String>) -> Fut + Clone,
51 Fut: Future<Output = ()>,
52 {
53 let urls = self.extract_urls(text);
54 if urls.is_empty() {
55 future(None, None).await;
56 return Ok(());
57 }
58
59 self.fetch_content_from_urls_async(urls, http_config, future)
60 .await?;
61
62 Ok(())
63 }
64
65 fn extract_urls(&self, text: &str) -> Vec<String> {
66 URL_REGEX
67 .find_iter(text)
68 .map(|m| clean_url(m.as_str()))
69 .collect()
70 }
71
72 fn fetch_content_from_urls(
74 &self,
75 urls: Vec<String>,
76 http_config: HttpConfig,
77 ) -> Vec<(String, String)> {
78 handles_http_requests_results(urls, http_config)
79 }
80
81 async fn fetch_content_from_urls_async<F, Fut>(
82 &self,
83 urls: Vec<String>,
84 http_config: HttpConfig,
85 future: F,
86 ) -> Result<(), Box<dyn std::error::Error>>
87 where
88 F: Fn(Option<String>, Option<String>) -> Fut + Clone,
89 Fut: Future<Output = ()>,
90 {
91 handles_http_requests_results_async(urls, http_config, future).await?;
92 Ok(())
93 }
94}
95
96fn handles_http_requests_results(
97 urls: Vec<String>,
98 http_config: HttpConfig,
99) -> Vec<(String, String)> {
100 let client = build_client(http_config);
101 let mut results = Vec::new();
102 let user_agent = UserAgent::random();
103
104 for url in &urls {
105 match client
106 .get(url)
107 .header("User-Agent", user_agent.to_string())
108 .header(
109 "Accept",
110 "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
111 )
112 .header("Accept-Language", "en-US,en;q=0.5")
113 .header("DNT", "1")
114 .header("Connection", "keep-alive")
115 .header("Upgrade-Insecure-Requests", "1")
116 .header("Sec-Fetch-Dest", "document")
117 .header("Sec-Fetch-Mode", "navigate")
118 .header("Sec-Fetch-Site", "none")
119 .header("Sec-Fetch-User", "?1")
120 .header("js_timeout", "2000")
121 .header("Cache-Control", "no-cache")
122 .header("js", "true")
123 .send()
124 {
125 Ok(response) => match response.text() {
126 Ok(html_content) => {
127 results.push((url.to_string(), html_content));
128 }
129 Err(e) => {
130 eprintln!("Error reading content from {}: {}", url, e);
131 }
132 },
133 Err(e) => {
134 eprintln!("Error accessing {}: {}", url, e);
135 }
136 }
137 }
138 results
139}
140
141async fn handles_http_requests_results_async<F, Fut>(
142 urls: Vec<String>,
143 http_config: HttpConfig,
144 future: F,
145) -> Result<(), Box<dyn std::error::Error>>
146where
147 F: Fn(Option<String>, Option<String>) -> Fut + Clone,
148 Fut: Future<Output = ()>,
149{
150 let client = build_client_async(http_config);
151 let user_agent = UserAgent::random();
152
153 let requests = urls.into_iter().map(|url| {
154 let client = client.clone();
155 let future = future.clone();
156
157 async move {
158 match client
159 .get(&url)
160 .header("User-Agent", user_agent.to_string())
161 .header(
162 "Accept",
163 "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
164 )
165 .header("Accept-Language", "en-US,en;q=0.5")
166 .header("DNT", "1")
167 .header("Connection", "keep-alive")
168 .header("Upgrade-Insecure-Requests", "1")
169 .header("Sec-Fetch-Dest", "document")
170 .header("Sec-Fetch-Mode", "navigate")
171 .header("Sec-Fetch-Site", "none")
172 .header("Sec-Fetch-User", "?1")
173 .header("js_timeout", "2000")
174 .header("js", "true")
175 .send()
176 .await
177 {
178 Ok(response) => {
179 let body = response.text().await.unwrap_or_default();
180 future(Some(url.to_string()), Some(body)).await
181 }
182 Err(e) => future(Some(url.to_string()), Some(format!("Error: {}", e))).await,
183 }
184 }
185 });
186
187 future::join_all(requests).await;
188
189 Ok(())
190}
191
192fn build_client(http_config: HttpConfig) -> blocking::Client {
193 match http_config.timeout() {
194 Some(timeout) => blocking::Client::builder()
195 .timeout(Duration::from_millis(timeout))
196 .redirect(reqwest::redirect::Policy::limited(
197 http_config.max_redirect().unwrap_or(2),
198 ))
199 .cookie_store(http_config.cookie_store())
200 .build()
201 .unwrap_or_else(|_| blocking::Client::new()),
202 None => blocking::Client::new(),
203 }
204}
205
206fn build_client_async(http_config: HttpConfig) -> Client {
207 match http_config.timeout() {
208 Some(timeout) => Client::builder()
209 .timeout(Duration::from_millis(timeout))
210 .redirect(reqwest::redirect::Policy::limited(
211 http_config.max_redirect().unwrap_or(2),
212 ))
213 .cookie_store(http_config.cookie_store())
214 .build()
215 .unwrap_or_else(|_| Client::new()),
216 None => Client::new(),
217 }
218}
219
/// Strips sentence punctuation that was captured at the end of a URL.
///
/// Trailing `.`, `,`, `;`, `!`, `?`, `]` and `}` are always removed. A
/// trailing `)` is removed only while the URL contains more `)` than `(`, so
/// parentheses that belong to the URL (for example
/// `.../Concurrency_(computer_science)`) survive, while the closing
/// parenthesis of surrounding prose (`(see https://example.com)`) does not.
///
/// Only surplus closing parentheses are dropped: `.../Foo_(bar)).` becomes
/// `.../Foo_(bar)` rather than losing both parentheses.
fn clean_url(url: &str) -> String {
    const TRAILING_PUNCTUATION: [char; 7] = ['.', ',', ';', '!', '?', ']', '}'];

    let open_parens = url.chars().filter(|&c| c == '(').count();
    // Number of ')' still present in `cleaned`; only ever removed from the end.
    let mut close_parens = url.chars().filter(|&c| c == ')').count();

    let mut cleaned = url;
    loop {
        let trimmed = cleaned.trim_end_matches(&TRAILING_PUNCTUATION[..]);
        match trimmed.strip_suffix(')') {
            // This ')' has no matching '(' inside the URL: drop it, then look
            // again for punctuation that was hiding behind it.
            Some(rest) if close_parens > open_parens => {
                close_parens -= 1;
                cleaned = rest;
            }
            _ => {
                cleaned = trimmed;
                break;
            }
        }
    }

    cleaned.to_string()
}
240
#[cfg(test)]
mod tests {
    use crate::http_config::HttpConfigBuilder;
    use std::sync::{Arc, Mutex};

    use super::*;

    /// Shared list of `(url, content)` pairs recorded by a test callback.
    type Captured = Arc<Mutex<Vec<(Option<String>, Option<String>)>>>;

    /// Returns a fresh capture list plus a callback that appends every
    /// `(url, content)` pair it receives to that list.
    fn recording_callback() -> (
        Captured,
        impl Fn(Option<String>, Option<String>) -> std::future::Ready<()> + Clone,
    ) {
        let captured: Captured = Arc::new(Mutex::new(Vec::new()));
        let sink = Arc::clone(&captured);
        let callback = move |url: Option<String>, content: Option<String>| {
            sink.lock().unwrap().push((url, content));
            std::future::ready(())
        };
        (captured, callback)
    }

    /// Configuration used by the fetch tests: a 30 second timeout.
    fn test_config() -> HttpConfig {
        HttpConfigBuilder::new().timeout(30000).build()
    }

    /// Asserts that nothing was handed to the callback.
    fn assert_nothing_captured(captured: &Captured) {
        let results = captured.lock().unwrap();
        assert_eq!(results.len(), 0);
    }

    /// Asserts that exactly one pair was captured and both halves are set.
    fn assert_single_populated_entry(captured: &Captured) {
        let results = captured.lock().unwrap();
        assert_eq!(results.len(), 1);
        assert!(results[0].0.is_some());
        assert!(results[0].1.is_some());
    }

    #[test]
    fn test_new() {
        let client = HttpClient::new();
        assert_eq!(std::mem::size_of_val(&client), 0);
    }

    #[test]
    fn test_extract_urls() {
        let client = HttpClient::new();

        let text = "Check out https://example.com and https://test.org for more info";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 2);
        assert!(urls.contains(&"https://example.com".to_string()));
        assert!(urls.contains(&"https://test.org".to_string()));

        let text = "This text has no URLs";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 0);
    }

    #[test]
    fn test_extract_urls_with_query_strings() {
        let client = HttpClient::new();

        // URLs with query strings, one of them followed by a full stop.
        let text = "Cavafy lived in England for much of his adolescence, and developed both a command of the English language and a preference for the writings of William Shakespeare http://www.poetryfoundation.org/archive/poet.html?id=6176 and Oscar Wilde http://www.poetryfoundation.org/archive/poet.html?id=7425. Cavafy's older brothers mismanaged the family business in Liverpool, and Cavafy's mother was ultimately compelled to move the family back to Alexandria, where they lived until 1882.";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 2);
        assert!(
            urls.contains(&"http://www.poetryfoundation.org/archive/poet.html?id=6176".to_string())
        );
        assert!(
            urls.contains(&"http://www.poetryfoundation.org/archive/poet.html?id=7425".to_string())
        );

        // URLs followed by commas, and one that legitimately ends in ')'.
        let text = "Rust is a general-purpose https://en.wikipedia.org/wiki/General-purpose_programming_language programming language https://en.wikipedia.org/wiki/Programming_language emphasizing performance https://en.wikipedia.org/wiki/Computer_performance, type safety https://en.wikipedia.org/wiki/Type_safety, and concurrency https://en.wikipedia.org/wiki/Concurrency_(computer_science). It enforces memory safety https://en.wikipedia.org/wiki/Memory_safety, meaning that all references point to valid memory.";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 6);
        assert!(urls.contains(
            &"https://en.wikipedia.org/wiki/General-purpose_programming_language".to_string()
        ));
        assert!(urls.contains(&"https://en.wikipedia.org/wiki/Programming_language".to_string()));
        assert!(urls.contains(&"https://en.wikipedia.org/wiki/Computer_performance".to_string()));
        assert!(urls.contains(&"https://en.wikipedia.org/wiki/Type_safety".to_string()));
        assert!(
            urls.contains(
                &"https://en.wikipedia.org/wiki/Concurrency_(computer_science)".to_string()
            )
        );
        assert!(urls.contains(&"https://en.wikipedia.org/wiki/Memory_safety".to_string()));

        // A trailing slash is part of the URL and must be kept.
        let text = "A language empowering everyone https://www.rust-lang.org/ to build reliable and efficient software.";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 1);
        assert!(urls.contains(&"https://www.rust-lang.org/".to_string()));
    }

    #[test]
    fn test_clean_url() {
        assert_eq!(clean_url("https://example.com."), "https://example.com");
        assert_eq!(clean_url("https://example.com,"), "https://example.com");
        assert_eq!(clean_url("https://example.com!"), "https://example.com");
        assert_eq!(clean_url("https://example.com"), "https://example.com");

        assert_eq!(
            clean_url("https://en.wikipedia.org/wiki/Concurrency_(computer_science)"),
            "https://en.wikipedia.org/wiki/Concurrency_(computer_science)"
        );

        assert_eq!(clean_url("https://example.com)"), "https://example.com");
    }

    #[test]
    fn test_fetch_content_from_urls_empty() {
        let client = HttpClient::new();
        let urls: Vec<String> = vec![];
        let results = client.fetch_content_from_urls(urls, test_config());
        assert_eq!(results.len(), 0);
    }

    #[test]
    fn test_fetch_content_from_text_no_urls() {
        let client = HttpClient::new();
        let text = "This text has no URLs";
        let results = client.fetch_content_from_text(text, test_config());
        assert_eq!(results.len(), 0);
    }

    #[tokio::test]
    async fn test_fetch_content_from_text_async_no_urls() {
        let client = HttpClient::new();
        let text = "This text has no URLs";
        let (captured, callback) = recording_callback();

        let result = client
            .fetch_content_from_text_async(text, test_config(), callback)
            .await;

        assert!(result.is_ok());
        // With no URLs the callback is still invoked once, with (None, None).
        let results = captured.lock().unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], (None, None));
    }

    // The `*_with_urls` tests below send a real request to httpbin.org.
    #[tokio::test]
    async fn test_fetch_content_from_text_async_with_urls() {
        let client = HttpClient::new();
        let text = "Check out https://httpbin.org/status/200 for testing";
        let (captured, callback) = recording_callback();

        let result = client
            .fetch_content_from_text_async(text, test_config(), callback)
            .await;

        assert!(result.is_ok());
        assert_single_populated_entry(&captured);
    }

    #[tokio::test]
    async fn test_fetch_content_from_urls_async_empty() {
        let client = HttpClient::new();
        let urls: Vec<String> = vec![];
        let (captured, callback) = recording_callback();

        let result = client
            .fetch_content_from_urls_async(urls, test_config(), callback)
            .await;

        assert!(result.is_ok());
        assert_nothing_captured(&captured);
    }

    #[tokio::test]
    async fn test_fetch_content_from_urls_async_with_urls() {
        let client = HttpClient::new();
        let urls = vec!["https://httpbin.org/status/200".to_string()];
        let (captured, callback) = recording_callback();

        let result = client
            .fetch_content_from_urls_async(urls, test_config(), callback)
            .await;

        assert!(result.is_ok());
        assert_single_populated_entry(&captured);
    }

    #[tokio::test]
    async fn test_handles_http_requests_results_async_empty() {
        let urls: Vec<String> = vec![];
        let (captured, callback) = recording_callback();

        let result = handles_http_requests_results_async(urls, test_config(), callback).await;

        assert!(result.is_ok());
        assert_nothing_captured(&captured);
    }

    #[tokio::test]
    async fn test_handles_http_requests_results_async_with_urls() {
        let urls = vec!["https://httpbin.org/status/200".to_string()];
        let (captured, callback) = recording_callback();

        let result = handles_http_requests_results_async(urls, test_config(), callback).await;

        assert!(result.is_ok());
        assert_single_populated_entry(&captured);
    }

    // The three tests below are smoke tests: they only prove that building a
    // client with the given configuration does not panic.
    #[test]
    fn test_build_client_async_with_timeout() {
        let http_config = HttpConfigBuilder::new().timeout(5000).build();
        let client = build_client_async(http_config);

        assert_eq!(
            std::mem::size_of_val(&client),
            std::mem::size_of::<Client>()
        );
    }

    #[test]
    fn test_build_client_async_without_timeout() {
        let http_config = HttpConfigBuilder::new().build();
        let client = build_client_async(http_config);

        assert_eq!(
            std::mem::size_of_val(&client),
            std::mem::size_of::<Client>()
        );
    }

    #[test]
    fn test_build_client_async_with_max_redirect() {
        let http_config = HttpConfigBuilder::new()
            .timeout(5000)
            .max_redirect(5)
            .build();
        let client = build_client_async(http_config);

        assert_eq!(
            std::mem::size_of_val(&client),
            std::mem::size_of::<Client>()
        );
    }
}