use anyhow::Result;
use serde::Deserialize;
use std::future::Future;
use std::pin::Pin;
use super::Provider;
#[derive(Clone)]
pub struct CommonCrawlProvider {
index: String,
include_subdomains: bool,
proxy: Option<String>,
proxy_auth: Option<String>,
timeout: u64,
retries: u32,
random_agent: bool,
insecure: bool,
parallel: u32,
rate_limit: Option<f32>,
#[cfg(test)]
base_url: String,
}
#[derive(Deserialize)]
struct CCRecord {
url: String,
}
impl CommonCrawlProvider {
#[allow(dead_code)]
pub fn new() -> Self {
CommonCrawlProvider {
index: "CC-MAIN-2025-13".to_string(),
include_subdomains: false,
proxy: None,
proxy_auth: None,
timeout: 10,
retries: 3,
random_agent: true,
insecure: false,
parallel: 1,
rate_limit: None,
#[cfg(test)]
base_url: "https://index.commoncrawl.org".to_string(),
}
}
pub fn with_index(index: String) -> Self {
CommonCrawlProvider {
index,
include_subdomains: false,
proxy: None,
proxy_auth: None,
timeout: 10,
retries: 3,
random_agent: true,
insecure: false,
parallel: 1,
rate_limit: None,
#[cfg(test)]
base_url: "https://index.commoncrawl.org".to_string(),
}
}
}
impl Provider for CommonCrawlProvider {
fn clone_box(&self) -> Box<dyn Provider> {
Box::new(self.clone())
}
fn fetch_urls<'a>(
&'a self,
domain: &'a str,
) -> Pin<Box<dyn Future<Output = Result<Vec<String>>> + Send + 'a>> {
Box::pin(async move {
#[cfg(test)]
let url_pattern = if self.include_subdomains {
format!(
"{}/{}-index?url=*.{}/*&output=json",
self.base_url, self.index, domain
)
} else {
format!(
"{}/{}-index?url={}/*&output=json",
self.base_url, self.index, domain
)
};
#[cfg(not(test))]
let url_pattern = if self.include_subdomains {
format!(
"https://index.commoncrawl.org/{}-index?url=*.{}/*&output=json",
self.index, domain
)
} else {
format!(
"https://index.commoncrawl.org/{}-index?url={}/*&output=json",
self.index, domain
)
};
let mut client_builder =
reqwest::Client::builder().timeout(std::time::Duration::from_secs(self.timeout));
if self.insecure {
client_builder = client_builder.danger_accept_invalid_certs(true);
}
if self.random_agent {
let ua = crate::network::random_user_agent();
client_builder = client_builder.user_agent(ua);
}
if let Some(proxy_url) = &self.proxy {
let mut proxy = reqwest::Proxy::all(proxy_url)?;
if let Some(auth) = &self.proxy_auth {
proxy = proxy.basic_auth(
auth.split(':').next().unwrap_or(""),
auth.split(':').nth(1).unwrap_or(""),
);
}
client_builder = client_builder.proxy(proxy);
}
let client = client_builder.build()?;
let mut last_error = None;
let mut attempt = 0;
while attempt <= self.retries {
if attempt > 0 {
tokio::time::sleep(std::time::Duration::from_millis(500 * attempt as u64))
.await;
}
match client.get(&url_pattern).send().await {
Ok(response) => {
if !response.status().is_success() {
attempt += 1;
last_error = Some(anyhow::anyhow!("HTTP error: {}", response.status()));
continue;
}
match response.text().await {
Ok(text) => {
if text.trim().is_empty() {
return Ok(Vec::new());
}
let mut urls = Vec::new();
for line in text.lines() {
if let Ok(record) = serde_json::from_str::<CCRecord>(line) {
urls.push(record.url);
}
}
urls.sort();
urls.dedup();
return Ok(urls);
}
Err(e) => {
attempt += 1;
last_error = Some(e.into());
continue;
}
}
}
Err(e) => {
attempt += 1;
last_error = Some(e.into());
continue;
}
}
}
if let Some(e) = last_error {
Err(anyhow::anyhow!(
"Failed after {} attempts: {}",
self.retries + 1,
e
))
} else {
Err(anyhow::anyhow!(
"Failed after {} attempts",
self.retries + 1
))
}
})
}
fn with_subdomains(&mut self, include: bool) {
self.include_subdomains = include;
}
fn with_proxy(&mut self, proxy: Option<String>) {
self.proxy = proxy;
}
fn with_proxy_auth(&mut self, auth: Option<String>) {
self.proxy_auth = auth;
}
fn with_timeout(&mut self, seconds: u64) {
self.timeout = seconds;
}
fn with_retries(&mut self, count: u32) {
self.retries = count;
}
fn with_random_agent(&mut self, enabled: bool) {
self.random_agent = enabled;
}
fn with_insecure(&mut self, enabled: bool) {
self.insecure = enabled;
}
fn with_parallel(&mut self, parallel: u32) {
self.parallel = parallel;
}
fn with_rate_limit(&mut self, rate_limit: Option<f32>) {
self.rate_limit = rate_limit;
}
}
#[cfg(test)]
pub struct MockCommonCrawlProvider {
mock_urls: Vec<String>,
include_subdomains: bool,
}
#[cfg(test)]
impl MockCommonCrawlProvider {
pub fn new(mock_urls: Vec<String>) -> Self {
MockCommonCrawlProvider {
mock_urls,
include_subdomains: false,
}
}
pub fn with_subdomains(&mut self, include: bool) -> &mut Self {
self.include_subdomains = include;
self
}
}
#[cfg(test)]
impl Provider for MockCommonCrawlProvider {
fn clone_box(&self) -> Box<dyn Provider> {
Box::new(Self {
mock_urls: self.mock_urls.clone(),
include_subdomains: self.include_subdomains,
})
}
fn fetch_urls<'a>(
&'a self,
_domain: &'a str,
) -> Pin<Box<dyn Future<Output = Result<Vec<String>>> + Send + 'a>> {
let urls = self.mock_urls.clone();
Box::pin(async move { Ok(urls) })
}
fn with_subdomains(&mut self, include: bool) {
self.include_subdomains = include;
}
fn with_proxy(&mut self, _proxy: Option<String>) {}
fn with_proxy_auth(&mut self, _auth: Option<String>) {}
fn with_timeout(&mut self, _seconds: u64) {}
fn with_retries(&mut self, _count: u32) {}
fn with_random_agent(&mut self, _enabled: bool) {}
fn with_insecure(&mut self, _enabled: bool) {}
fn with_parallel(&mut self, _parallel: u32) {}
fn with_rate_limit(&mut self, _rate_limit: Option<f32>) {}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_new_provider() {
let provider = CommonCrawlProvider::new();
assert_eq!(provider.index, "CC-MAIN-2025-13");
assert!(!provider.include_subdomains);
assert_eq!(provider.proxy, None);
assert_eq!(provider.proxy_auth, None);
assert_eq!(provider.timeout, 10);
assert_eq!(provider.retries, 3);
assert!(provider.random_agent);
assert!(!provider.insecure);
assert_eq!(provider.parallel, 1);
assert_eq!(provider.rate_limit, None);
assert_eq!(provider.base_url, "https://index.commoncrawl.org");
}
#[test]
fn test_with_index() {
let index = "CC-MAIN-2023-06".to_string();
let provider = CommonCrawlProvider::with_index(index.clone());
assert_eq!(provider.index, index);
assert_eq!(provider.base_url, "https://index.commoncrawl.org");
}
#[test]
fn test_with_subdomains() {
let mut provider = CommonCrawlProvider::new();
provider.with_subdomains(true);
assert!(provider.include_subdomains);
}
#[test]
fn test_with_proxy() {
let mut provider = CommonCrawlProvider::new();
provider.with_proxy(Some("http://proxy.example.com:8080".to_string()));
assert_eq!(
provider.proxy,
Some("http://proxy.example.com:8080".to_string())
);
}
#[test]
fn test_with_proxy_auth() {
let mut provider = CommonCrawlProvider::new();
provider.with_proxy_auth(Some("user:pass".to_string()));
assert_eq!(provider.proxy_auth, Some("user:pass".to_string()));
}
#[test]
fn test_with_timeout() {
let mut provider = CommonCrawlProvider::new();
provider.with_timeout(60);
assert_eq!(provider.timeout, 60);
}
#[test]
fn test_with_retries() {
let mut provider = CommonCrawlProvider::new();
provider.with_retries(5);
assert_eq!(provider.retries, 5);
}
#[test]
fn test_with_random_agent() {
let mut provider = CommonCrawlProvider::new();
provider.with_random_agent(false);
assert!(!provider.random_agent);
}
#[test]
fn test_with_insecure() {
let mut provider = CommonCrawlProvider::new();
provider.with_insecure(true);
assert!(provider.insecure);
}
#[test]
fn test_with_parallel() {
let mut provider = CommonCrawlProvider::new();
provider.with_parallel(10);
assert_eq!(provider.parallel, 10);
}
#[test]
fn test_with_rate_limit() {
let mut provider = CommonCrawlProvider::new();
provider.with_rate_limit(Some(2.5));
assert_eq!(provider.rate_limit, Some(2.5));
}
#[test]
fn test_clone_box() {
let provider = CommonCrawlProvider::new();
let _cloned = provider.clone_box();
}
#[tokio::test]
#[ignore = "Skip tests that make actual network requests in CI"]
async fn test_fetch_urls_builds_correct_url_without_subdomains() {
let urls = vec![
"https://example.com/page1".to_string(),
"https://example.com/page2".to_string(),
];
let provider = MockCommonCrawlProvider::new(urls.clone());
let result = provider.fetch_urls("example.com").await;
assert!(result.is_ok());
let fetched_urls = result.unwrap();
assert_eq!(fetched_urls.len(), 2);
assert_eq!(fetched_urls[0], "https://example.com/page1");
assert_eq!(fetched_urls[1], "https://example.com/page2");
}
#[tokio::test]
#[ignore = "Skip tests that make actual network requests in CI"]
async fn test_fetch_urls_builds_correct_url_with_subdomains() {
let urls = vec![
"https://sub1.example.com/page1".to_string(),
"https://sub2.example.com/page2".to_string(),
];
let mut provider = MockCommonCrawlProvider::new(urls.clone());
provider.with_subdomains(true);
let result = provider.fetch_urls("example.com").await;
assert!(result.is_ok());
let fetched_urls = result.unwrap();
assert_eq!(fetched_urls.len(), 2);
assert_eq!(fetched_urls[0], "https://sub1.example.com/page1");
assert_eq!(fetched_urls[1], "https://sub2.example.com/page2");
}
#[test]
fn test_cc_record_deserialize() {
let json = r#"{"url":"https://example.com/test"}"#;
let record: CCRecord = serde_json::from_str(json).unwrap();
assert_eq!(record.url, "https://example.com/test");
}
#[test]
fn test_url_construction_without_subdomains() {
let provider = CommonCrawlProvider::new();
let url = format!(
"https://index.commoncrawl.org/{}-index?url={}/*&output=json",
provider.index, "example.com"
);
assert_eq!(
url,
"https://index.commoncrawl.org/CC-MAIN-2025-13-index?url=example.com/*&output=json"
);
}
#[test]
fn test_url_construction_with_subdomains() {
let mut provider = CommonCrawlProvider::new();
provider.with_subdomains(true);
let url = format!(
"https://index.commoncrawl.org/{}-index?url=*.{}/*&output=json",
provider.index, "example.com"
);
assert_eq!(
url,
"https://index.commoncrawl.org/CC-MAIN-2025-13-index?url=*.example.com/*&output=json"
);
}
}