markdown_harvest/
http_client.rs

1use crate::http_regex::URL_REGEX;
2use crate::{http_config::HttpConfig, user_agent::UserAgent};
3use futures::future;
4use reqwest::{Client, blocking};
5use std::future::Future;
6use std::time::Duration;
7
/// Component responsible for handling HTTP requests and URL processing.
///
/// `HttpClient` encapsulates all HTTP-related functionality including URL extraction,
/// URL cleaning, and content fetching. The struct itself holds no state: its methods
/// delegate to the free helper functions in this module, which build a fresh reqwest
/// client from the supplied `HttpConfig` on every call.
#[derive(Default)]
pub struct HttpClient {}
14
15impl HttpClient {
16    /// Creates a new HttpClient instance.
17    pub fn new() -> Self {
18        Self {}
19    }
20
21    /// Extracts URLs from text and fetches their content with custom HTTP configuration.
22    ///
23    /// # Arguments
24    ///
25    /// * `text` - Input text that may contain URLs
26    /// * `http_config` - HTTP configuration including timeout, retries, and other settings
27    ///
28    /// # Returns
29    ///
30    /// A vector of tuples containing (URL, HTML content)
31    pub fn fetch_content_from_text(
32        &self,
33        text: &str,
34        http_config: HttpConfig,
35    ) -> Vec<(String, String)> {
36        let urls = self.extract_urls(text);
37        if urls.is_empty() {
38            return Vec::new();
39        }
40        self.fetch_content_from_urls(urls, http_config)
41    }
42
43    pub async fn fetch_content_from_text_async<F, Fut>(
44        &self,
45        text: &str,
46        http_config: HttpConfig,
47        future: F,
48    ) -> Result<(), Box<dyn std::error::Error>>
49    where
50        F: Fn(Option<String>, Option<String>) -> Fut + Clone,
51        Fut: Future<Output = ()>,
52    {
53        let urls = self.extract_urls(text);
54        if urls.is_empty() {
55            future(None, None).await;
56            return Ok(());
57        }
58
59        self.fetch_content_from_urls_async(urls, http_config, future)
60            .await?;
61
62        Ok(())
63    }
64
65    fn extract_urls(&self, text: &str) -> Vec<String> {
66        URL_REGEX
67            .find_iter(text)
68            .map(|m| clean_url(m.as_str()))
69            .collect()
70    }
71
72    /// Fetches HTML content from a list of URLs with custom HTTP configuration.
73    fn fetch_content_from_urls(
74        &self,
75        urls: Vec<String>,
76        http_config: HttpConfig,
77    ) -> Vec<(String, String)> {
78        handles_http_requests_results(urls, http_config)
79    }
80
81    async fn fetch_content_from_urls_async<F, Fut>(
82        &self,
83        urls: Vec<String>,
84        http_config: HttpConfig,
85        future: F,
86    ) -> Result<(), Box<dyn std::error::Error>>
87    where
88        F: Fn(Option<String>, Option<String>) -> Fut + Clone,
89        Fut: Future<Output = ()>,
90    {
91        handles_http_requests_results_async(urls, http_config, future).await?;
92        Ok(())
93    }
94}
95
96fn handles_http_requests_results(
97    urls: Vec<String>,
98    http_config: HttpConfig,
99) -> Vec<(String, String)> {
100    let client = build_client(http_config);
101    let mut results = Vec::new();
102    let user_agent = UserAgent::random();
103
104    for url in &urls {
105        match client
106            .get(url)
107            .header("User-Agent", user_agent.to_string())
108            .header(
109                "Accept",
110                "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
111            )
112            .header("Accept-Language", "en-US,en;q=0.5")
113            .header("DNT", "1")
114            .header("Connection", "keep-alive")
115            .header("Upgrade-Insecure-Requests", "1")
116            .header("Sec-Fetch-Dest", "document")
117            .header("Sec-Fetch-Mode", "navigate")
118            .header("Sec-Fetch-Site", "none")
119            .header("Sec-Fetch-User", "?1")
120            .header("js_timeout", "2000")
121            .header("Cache-Control", "no-cache")
122            .header("js", "true")
123            .send()
124        {
125            Ok(response) => match response.text() {
126                Ok(html_content) => {
127                    results.push((url.to_string(), html_content));
128                }
129                Err(e) => {
130                    eprintln!("Error reading content from {}: {}", url, e);
131                }
132            },
133            Err(e) => {
134                eprintln!("Error accessing {}: {}", url, e);
135            }
136        }
137    }
138    results
139}
140
141async fn handles_http_requests_results_async<F, Fut>(
142    urls: Vec<String>,
143    http_config: HttpConfig,
144    future: F,
145) -> Result<(), Box<dyn std::error::Error>>
146where
147    F: Fn(Option<String>, Option<String>) -> Fut + Clone,
148    Fut: Future<Output = ()>,
149{
150    let client = build_client_async(http_config);
151    let user_agent = UserAgent::random();
152
153    let requests = urls.into_iter().map(|url| {
154        let client = client.clone();
155        let future = future.clone();
156
157        async move {
158            match client
159                .get(&url)
160                .header("User-Agent", user_agent.to_string())
161                .header(
162                    "Accept",
163                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
164                )
165                .header("Accept-Language", "en-US,en;q=0.5")
166                .header("DNT", "1")
167                .header("Connection", "keep-alive")
168                .header("Upgrade-Insecure-Requests", "1")
169                .header("Sec-Fetch-Dest", "document")
170                .header("Sec-Fetch-Mode", "navigate")
171                .header("Sec-Fetch-Site", "none")
172                .header("Sec-Fetch-User", "?1")
173                .header("js_timeout", "2000")
174                .header("js", "true")
175                .send()
176                .await
177            {
178                Ok(response) => {
179                    let body = response.text().await.unwrap_or_default();
180                    future(Some(url.to_string()), Some(body)).await
181                }
182                Err(e) => future(Some(url.to_string()), Some(format!("Error: {}", e))).await,
183            }
184        }
185    });
186
187    future::join_all(requests).await;
188
189    Ok(())
190}
191
192fn build_client(http_config: HttpConfig) -> blocking::Client {
193    match http_config.timeout() {
194        Some(timeout) => blocking::Client::builder()
195            .timeout(Duration::from_millis(timeout))
196            .redirect(reqwest::redirect::Policy::limited(
197                http_config.max_redirect().unwrap_or(2),
198            ))
199            .cookie_store(http_config.cookie_store())
200            .build()
201            .unwrap_or_else(|_| blocking::Client::new()),
202        None => blocking::Client::new(),
203    }
204}
205
206fn build_client_async(http_config: HttpConfig) -> Client {
207    match http_config.timeout() {
208        Some(timeout) => Client::builder()
209            .timeout(Duration::from_millis(timeout))
210            .redirect(reqwest::redirect::Policy::limited(
211                http_config.max_redirect().unwrap_or(2),
212            ))
213            .cookie_store(http_config.cookie_store())
214            .build()
215            .unwrap_or_else(|_| Client::new()),
216        None => Client::new(),
217    }
218}
219
/// Strips punctuation that trails a URL matched inside running text.
///
/// Trailing `.`, `,`, `;`, `!`, `?`, `]` and `}` are always removed. A trailing
/// `)` is removed only while the URL contains more closing than opening
/// parentheses, so a parenthesis that belongs to the URL is kept:
///
/// * `https://example.com).` becomes `https://example.com`
/// * `https://en.wikipedia.org/wiki/Concurrency_(computer_science)).` keeps the
///   first `)` and loses only the surplus `)` and the `.`
///
/// # Arguments
///
/// * `url` - Raw URL text as matched in the surrounding prose
///
/// # Returns
///
/// The cleaned URL as an owned `String`.
fn clean_url(url: &str) -> String {
    const TRAILING_PUNCTUATION: [char; 7] = ['.', ',', ';', '!', '?', ']', '}'];

    // Trimming never removes '(', so the opening count stays valid throughout.
    let opening = url.matches('(').count();
    let mut closing = url.matches(')').count();
    let mut cleaned = url;

    loop {
        let length_before = cleaned.len();

        cleaned = cleaned.trim_end_matches(&TRAILING_PUNCTUATION[..]);

        // Drop one surplus ')' per pass; punctuation hidden behind it is
        // handled by the next pass.
        if closing > opening {
            if let Some(rest) = cleaned.strip_suffix(')') {
                cleaned = rest;
                closing -= 1;
            }
        }

        if cleaned.len() == length_before {
            break;
        }
    }

    cleaned.to_string()
}
240
#[cfg(test)]
mod tests {
    use crate::http_config::HttpConfigBuilder;
    use std::pin::Pin;
    use std::sync::{Arc, Mutex};

    use super::*;

    /// Arguments of one callback invocation: `(url, content)`.
    type CallbackArgs = (Option<String>, Option<String>);

    /// Shared, thread-safe log of every callback invocation.
    type CallLog = Arc<Mutex<Vec<CallbackArgs>>>;

    /// Returns a call log plus a cloneable callback that appends its arguments
    /// to that log when the future it returns is awaited.
    fn recording_callback() -> (
        CallLog,
        impl Fn(Option<String>, Option<String>) -> Pin<Box<dyn Future<Output = ()>>> + Clone,
    ) {
        let log: CallLog = Arc::new(Mutex::new(Vec::new()));
        let sink = Arc::clone(&log);

        let callback = move |url: Option<String>, content: Option<String>| {
            let sink = Arc::clone(&sink);
            Box::pin(async move {
                sink.lock().unwrap().push((url, content));
            }) as Pin<Box<dyn Future<Output = ()>>>
        };

        (log, callback)
    }

    /// Configuration shared by the fetch tests: 30 second timeout, defaults otherwise.
    fn test_config() -> HttpConfig {
        HttpConfigBuilder::new().timeout(30000).build()
    }

    /// Asserts that the callback ran exactly once with both arguments populated.
    fn assert_single_populated_call(log: &CallLog) {
        let calls = log.lock().unwrap();
        assert_eq!(calls.len(), 1);
        assert!(calls[0].0.is_some());
        assert!(calls[0].1.is_some());
    }

    #[test]
    fn test_new() {
        let client = HttpClient::new();
        // The client is expected to stay a zero-sized, stateless type.
        assert_eq!(std::mem::size_of_val(&client), 0);
    }

    #[test]
    fn test_extract_urls() {
        let client = HttpClient::new();

        let text = "Check out https://example.com and https://test.org for more info";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 2);
        assert!(urls.contains(&"https://example.com".to_string()));
        assert!(urls.contains(&"https://test.org".to_string()));

        let text = "This text has no URLs";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 0);
    }

    #[test]
    fn test_extract_urls_with_query_strings() {
        let client = HttpClient::new();

        // Test case 1: Sample text with query string
        let text = "Cavafy lived in England for much of his adolescence, and developed both a command of the English language and a preference for the writings of William Shakespeare http://www.poetryfoundation.org/archive/poet.html?id=6176 and Oscar Wilde http://www.poetryfoundation.org/archive/poet.html?id=7425. Cavafy's older brothers mismanaged the family business in Liverpool, and Cavafy's mother was ultimately compelled to move the family back to Alexandria, where they lived until 1882.";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 2);
        assert!(
            urls.contains(&"http://www.poetryfoundation.org/archive/poet.html?id=6176".to_string())
        );
        assert!(
            urls.contains(&"http://www.poetryfoundation.org/archive/poet.html?id=7425".to_string())
        );

        // Test case 2: Sample text with no query string
        let text = "Rust is a general-purpose https://en.wikipedia.org/wiki/General-purpose_programming_language programming language https://en.wikipedia.org/wiki/Programming_language emphasizing performance https://en.wikipedia.org/wiki/Computer_performance, type safety https://en.wikipedia.org/wiki/Type_safety, and concurrency https://en.wikipedia.org/wiki/Concurrency_(computer_science). It enforces memory safety https://en.wikipedia.org/wiki/Memory_safety, meaning that all references point to valid memory.";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 6);
        assert!(urls.contains(
            &"https://en.wikipedia.org/wiki/General-purpose_programming_language".to_string()
        ));
        assert!(urls.contains(&"https://en.wikipedia.org/wiki/Programming_language".to_string()));
        assert!(urls.contains(&"https://en.wikipedia.org/wiki/Computer_performance".to_string()));
        assert!(urls.contains(&"https://en.wikipedia.org/wiki/Type_safety".to_string()));
        assert!(
            urls.contains(
                &"https://en.wikipedia.org/wiki/Concurrency_(computer_science)".to_string()
            )
        );
        assert!(urls.contains(&"https://en.wikipedia.org/wiki/Memory_safety".to_string()));

        // Test case 3: Simple URL without query string
        let text = "A language empowering everyone https://www.rust-lang.org/ to build reliable and efficient software.";
        let urls = client.extract_urls(text);
        assert_eq!(urls.len(), 1);
        assert!(urls.contains(&"https://www.rust-lang.org/".to_string()));
    }

    #[test]
    fn test_clean_url() {
        assert_eq!(clean_url("https://example.com."), "https://example.com");
        assert_eq!(clean_url("https://example.com,"), "https://example.com");
        assert_eq!(clean_url("https://example.com!"), "https://example.com");
        assert_eq!(clean_url("https://example.com"), "https://example.com");

        // Balanced parentheses belong to the URL and must be kept
        assert_eq!(
            clean_url("https://en.wikipedia.org/wiki/Concurrency_(computer_science)"),
            "https://en.wikipedia.org/wiki/Concurrency_(computer_science)"
        );

        // A surplus closing parenthesis is trailing punctuation and must be removed
        assert_eq!(clean_url("https://example.com)"), "https://example.com");
    }

    #[test]
    fn test_fetch_content_from_urls_empty() {
        let client = HttpClient::new();
        let results = client.fetch_content_from_urls(Vec::new(), test_config());
        assert_eq!(results.len(), 0);
    }

    #[test]
    fn test_fetch_content_from_text_no_urls() {
        let client = HttpClient::new();
        let results = client.fetch_content_from_text("This text has no URLs", test_config());
        assert_eq!(results.len(), 0);
    }

    #[tokio::test]
    async fn test_fetch_content_from_text_async_no_urls() {
        let client = HttpClient::new();
        let (log, callback) = recording_callback();

        let result = client
            .fetch_content_from_text_async("This text has no URLs", test_config(), callback)
            .await;

        assert!(result.is_ok());
        let calls = log.lock().unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0], (None, None));
    }

    // The following `*_with_urls` tests contact httpbin.org. Request failures are
    // also delivered to the callback as `Some(..)` values, so the assertions only
    // check that the callback ran once with both arguments populated.

    #[tokio::test]
    async fn test_fetch_content_from_text_async_with_urls() {
        let client = HttpClient::new();
        let (log, callback) = recording_callback();

        let result = client
            .fetch_content_from_text_async(
                "Check out https://httpbin.org/status/200 for testing",
                test_config(),
                callback,
            )
            .await;

        assert!(result.is_ok());
        assert_single_populated_call(&log);
    }

    #[tokio::test]
    async fn test_fetch_content_from_urls_async_empty() {
        let client = HttpClient::new();
        let (log, callback) = recording_callback();

        let result = client
            .fetch_content_from_urls_async(Vec::new(), test_config(), callback)
            .await;

        assert!(result.is_ok());
        assert_eq!(log.lock().unwrap().len(), 0);
    }

    #[tokio::test]
    async fn test_fetch_content_from_urls_async_with_urls() {
        let client = HttpClient::new();
        let urls = vec!["https://httpbin.org/status/200".to_string()];
        let (log, callback) = recording_callback();

        let result = client
            .fetch_content_from_urls_async(urls, test_config(), callback)
            .await;

        assert!(result.is_ok());
        assert_single_populated_call(&log);
    }

    #[tokio::test]
    async fn test_handles_http_requests_results_async_empty() {
        let (log, callback) = recording_callback();

        let result =
            handles_http_requests_results_async(Vec::new(), test_config(), callback).await;

        assert!(result.is_ok());
        assert_eq!(log.lock().unwrap().len(), 0);
    }

    #[tokio::test]
    async fn test_handles_http_requests_results_async_with_urls() {
        let urls = vec!["https://httpbin.org/status/200".to_string()];
        let (log, callback) = recording_callback();

        let result = handles_http_requests_results_async(urls, test_config(), callback).await;

        assert!(result.is_ok());
        assert_single_populated_call(&log);
    }

    // The client-construction tests below only check that building does not panic;
    // no assertion on the resulting client's settings is made here.

    #[test]
    fn test_build_client_async_with_timeout() {
        let http_config = HttpConfigBuilder::new().timeout(5000).build();
        let _client: Client = build_client_async(http_config);
    }

    #[test]
    fn test_build_client_async_without_timeout() {
        let http_config = HttpConfigBuilder::new().build();
        let _client: Client = build_client_async(http_config);
    }

    #[test]
    fn test_build_client_async_with_max_redirect() {
        let http_config = HttpConfigBuilder::new()
            .timeout(5000)
            .max_redirect(5)
            .build();
        let _client: Client = build_client_async(http_config);
    }
}