use anyhow::Result;
use scraper::{Html, Selector};
use std::future::Future;
use std::pin::Pin;
use url::Url;
use super::Tester;
/// Extracts hyperlink targets (`<a href>`) from the HTML page served at a URL.
///
/// Network behavior (timeout, retries, proxy, TLS strictness, user agent) is
/// configured through the `Tester` setter methods before calling `test_url`.
#[derive(Debug, Clone)]
pub struct LinkExtractor {
    /// Optional proxy URL (e.g. "http://proxy.example.com:8080").
    proxy: Option<String>,
    /// Optional proxy credentials in "user:password" form.
    proxy_auth: Option<String>,
    /// Per-request timeout in seconds.
    timeout: u64,
    /// Number of retries after the initial attempt.
    retries: u32,
    /// When true, send a randomized User-Agent header.
    random_agent: bool,
    /// When true, accept invalid TLS certificates (danger: opt-in only).
    insecure: bool,
}
impl LinkExtractor {
    /// Creates an extractor with the defaults: 30 s timeout, 3 retries,
    /// no proxy, fixed user agent, strict TLS validation.
    pub fn new() -> Self {
        LinkExtractor {
            proxy: None,
            proxy_auth: None,
            timeout: 30,
            retries: 3,
            random_agent: false,
            insecure: false,
        }
    }
}
impl Default for LinkExtractor {
    /// Mirrors [`LinkExtractor::new`] so the type satisfies clippy's
    /// `new_without_default` and composes with `..Default::default()`.
    fn default() -> Self {
        Self::new()
    }
}
impl Tester for LinkExtractor {
    /// Returns a boxed copy so `Box<dyn Tester>` collections can be cloned.
    fn clone_box(&self) -> Box<dyn Tester> {
        Box::new(self.clone())
    }
    /// Fetches `url` and returns every `<a href>` target resolved to an
    /// absolute URL against the page URL. Hrefs that fail to join are
    /// silently skipped.
    ///
    /// The request is attempted up to `retries + 1` times; transport errors
    /// trigger a 500 ms back-off before the next attempt, and the last error
    /// is reported if every attempt fails.
    ///
    /// # Errors
    /// - `url` is not a valid absolute URL (rejected before any request),
    /// - the HTTP client cannot be built (e.g. bad proxy configuration),
    /// - all attempts fail, or a response body cannot be read.
    fn test_url<'a>(
        &'a self,
        url: &'a str,
    ) -> Pin<Box<dyn Future<Output = Result<Vec<String>>> + Send + 'a>> {
        Box::pin(async move {
            // Validate the target URL up front: it is loop-invariant, and a
            // malformed URL should not cost a network round-trip (previously
            // it was only detected after a successful request, once per try).
            let base_url = Url::parse(url)
                .map_err(|_| anyhow::anyhow!("Failed to parse URL: {}", url))?;
            let mut client_builder =
                reqwest::Client::builder().timeout(std::time::Duration::from_secs(self.timeout));
            if self.insecure {
                // Opt-in: accept self-signed / expired certificates.
                client_builder = client_builder.danger_accept_invalid_certs(true);
            }
            if self.random_agent {
                let ua = crate::network::random_user_agent();
                client_builder = client_builder.user_agent(ua);
            }
            if let Some(proxy_url) = &self.proxy {
                let mut proxy = reqwest::Proxy::all(proxy_url)?;
                if let Some(auth) = &self.proxy_auth {
                    // Credentials are "user:password"; a value without a ':'
                    // is ignored rather than rejected (unchanged policy).
                    if let Some((user, pass)) = auth.split_once(':') {
                        proxy = proxy.basic_auth(user, pass);
                    }
                }
                client_builder = client_builder.proxy(proxy);
            }
            let client = client_builder.build()?;
            // The selector is a compile-time constant string; parse it once
            // instead of once per retry. A failure here is a programming bug.
            let selector =
                Selector::parse("a[href]").expect("static selector \"a[href]\" must parse");
            let mut last_error = None;
            for attempt in 0..=self.retries {
                match client.get(url).send().await {
                    Ok(response) => {
                        let html_content = response.text().await?;
                        let document = Html::parse_document(&html_content);
                        let mut links = Vec::new();
                        for element in document.select(&selector) {
                            if let Some(href) = element.value().attr("href") {
                                // Resolve relative hrefs; skip unjoinable ones.
                                if let Ok(absolute_url) = base_url.join(href) {
                                    links.push(absolute_url.to_string());
                                }
                            }
                        }
                        return Ok(links);
                    }
                    Err(e) => {
                        last_error = Some(e);
                        // Back off before the next attempt, but do not sleep
                        // after the final failure (the old code always slept,
                        // delaying the error report by 500 ms for nothing).
                        if attempt < self.retries {
                            tokio::time::sleep(std::time::Duration::from_millis(500)).await;
                        }
                    }
                }
            }
            Err(anyhow::anyhow!(
                "Failed to extract links from {}: {:?}",
                url,
                last_error
            ))
        })
    }
    /// Sets the per-request timeout in seconds.
    fn with_timeout(&mut self, seconds: u64) {
        self.timeout = seconds;
    }
    /// Sets the number of retries after the initial attempt.
    fn with_retries(&mut self, count: u32) {
        self.retries = count;
    }
    /// Enables/disables sending a randomized User-Agent header.
    fn with_random_agent(&mut self, enabled: bool) {
        self.random_agent = enabled;
    }
    /// Enables/disables acceptance of invalid TLS certificates.
    fn with_insecure(&mut self, enabled: bool) {
        self.insecure = enabled;
    }
    /// Sets (or clears) the proxy URL.
    fn with_proxy(&mut self, proxy: Option<String>) {
        self.proxy = proxy;
    }
    /// Sets (or clears) the "user:password" proxy credentials.
    fn with_proxy_auth(&mut self, auth: Option<String>) {
        self.proxy_auth = auth;
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh extractor must carry the documented defaults.
    #[test]
    fn test_link_extractor_new() {
        let le = LinkExtractor::new();
        assert_eq!(le.timeout, 30);
        assert_eq!(le.retries, 3);
        assert!(!le.random_agent);
        assert!(!le.insecure);
        assert!(le.proxy.is_none());
        assert!(le.proxy_auth.is_none());
    }

    /// Setter round-trip: timeout.
    #[test]
    fn test_link_extractor_with_timeout() {
        let mut le = LinkExtractor::new();
        le.with_timeout(60);
        assert_eq!(le.timeout, 60);
    }

    /// Setter round-trip: retry count.
    #[test]
    fn test_link_extractor_with_retries() {
        let mut le = LinkExtractor::new();
        le.with_retries(5);
        assert_eq!(le.retries, 5);
    }

    /// Setter round-trip: random user-agent flag.
    #[test]
    fn test_link_extractor_with_random_agent() {
        let mut le = LinkExtractor::new();
        le.with_random_agent(true);
        assert!(le.random_agent);
    }

    /// Setter round-trip: insecure-TLS flag.
    #[test]
    fn test_link_extractor_with_insecure() {
        let mut le = LinkExtractor::new();
        le.with_insecure(true);
        assert!(le.insecure);
    }

    /// Setter round-trip: proxy URL.
    #[test]
    fn test_link_extractor_with_proxy() {
        let addr = "http://proxy.example.com:8080";
        let mut le = LinkExtractor::new();
        le.with_proxy(Some(addr.to_string()));
        assert_eq!(le.proxy.as_deref(), Some(addr));
    }

    /// Setter round-trip: proxy credentials.
    #[test]
    fn test_link_extractor_with_proxy_auth() {
        let creds = "username:password";
        let mut le = LinkExtractor::new();
        le.with_proxy_auth(Some(creds.to_string()));
        assert_eq!(le.proxy_auth.as_deref(), Some(creds));
    }

    /// `clone_box` must yield an owned trait object without panicking.
    #[test]
    fn test_link_extractor_clone_box() {
        let _boxed = LinkExtractor::new().clone_box();
    }
}