use resext::ctx;
use std::{
collections::HashSet,
sync::{Arc, LazyLock},
time::{Duration, Instant},
};
use reqwest::{Client, Response};
use scraper::{Html, Selector};
use tokio::{sync::Mutex, time::sleep};
use url::Url;
use crate::{
UrlRepo,
error::{Log, Res, ResExt},
fetch::*,
match_option,
output::write_output,
};
pub struct CrawnClient {
client: Client,
next_req: Mutex<Instant>,
}
impl CrawnClient {
pub fn new() -> Res<Self> {
Ok(Self {
client: Client::builder()
.timeout(Duration::from_secs(10))
.build()
.context("Failed to build client")?,
next_req: Mutex::new(Instant::now()),
})
}
pub async fn get(&self, url: &str) -> Res<Response> {
let mut next_req = self.next_req.lock().await;
let now = Instant::now();
if now < *next_req {
sleep(*next_req - now).await;
}
let res = self
.client
.get(url)
.send()
.await
.context(ctx!("Failed to fetch URL: {}", url));
*next_req = Instant::now() + Duration::from_millis(rand::random_range(300..601));
res
}
pub async fn timeout(&self, time: Duration) {
*self.next_req.lock().await = Instant::now() + time;
}
}
pub struct Selectors {
pub anchor: Selector,
pub title: Selector,
pub body: Option<Selector>,
}
pub async fn worker<R: UrlRepo>(
repo: Arc<Mutex<R>>,
selectors: Arc<Selectors>,
client: Arc<CrawnClient>,
url: String,
can_extract: bool,
) -> Res<()> {
let args = &*crate::ARGS;
let client = Arc::clone(&client);
let base = Url::parse(&url).context(ctx!("Failed to parse URL: {}", &url))?;
let content = fetch_url(&url, client).await?;
if args.verbose {
format!("Fetched content from URL: {}", &url).log().await?;
}
let (links, title, text, content) = {
let selectors = Arc::clone(&selectors);
let mut link_count = 0usize;
let task = tokio::task::spawn_blocking(move || {
let doc = Html::parse_document(&content);
let links = if can_extract {
extract_links(&doc, Arc::new(base), &selectors.anchor)
} else {
Vec::new()
};
let text = selectors
.body
.as_ref()
.map(|body_selector| extract_text(&doc, body_selector));
let title = extract_title(&doc, &selectors.title);
(
text,
title,
links,
if args.include_content {
Some(content)
} else {
None
},
)
});
let (text, title, links, content) = task
.await
.context("Failed to extract links and text from HTML body concurrently")?;
{
let temp = Arc::clone(&repo);
let mut rp = temp.lock().await;
for link in links {
let link = match_option!(link.log().await?);
let link = match_option!(normalize_url(link).log().await?);
match_option!(rp.add(link).await.log().await?);
link_count += 1;
}
}
(link_count, title, text, content)
};
write_output(url, title, links, text, content)
.await
.context("Failed to write output entry for URL")?;
Ok(())
}
static GENERICS: LazyLock<HashSet<&'static str>> =
LazyLock::new(|| HashSet::from(["tutorial", "guide", "blog"]));
pub fn should_crawl(
base_domain: Arc<String>,
base_keywords: Arc<HashSet<String>>,
other: &Url,
) -> bool {
if let Some(other_domain) = other.domain() {
if other_domain != base_domain.as_str() {
return false;
}
} else {
return false;
}
let other_keywords = get_keywords(other);
let match_count = other_keywords
.iter()
.filter(|kw| base_keywords.contains(kw.as_str()) || GENERICS.contains(&kw.as_str()))
.count();
if match_count >= 2 {
return true;
}
true
}
static STOP_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
HashSet::from([
"how",
"to",
"the",
"and",
"for",
"with",
"from",
"about",
"by",
"category",
"catalogue",
])
});
pub fn get_keywords(url: &Url) -> HashSet<String> {
let mut url = url.clone();
url.set_query(None);
url.set_fragment(None);
let path = url.path().to_lowercase();
path.split(['/', '-', '_'])
.filter(|s| {
!s.chars().all(|c| c.is_numeric())
&& !s.is_empty()
&& s.len() >= 3
&& !STOP_WORDS.contains(s)
})
.map(|s| s.chars().filter(|c| c.is_ascii_alphanumeric()).collect())
.collect()
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use url::Url;
use crate::{
crawler::get_keywords,
error::{Res, ResExt},
};
#[test]
fn test_keyword_extraction() -> Res<()> {
let url = Url::parse(
"https://example.com/rust-programming-language/category/async/tokio/beginner_tutorial",
)
.context("Failed to parse URL")?;
let kws = get_keywords(&url);
assert_eq!(
kws,
HashSet::from([
"rust".to_string(),
"programming".to_string(),
"language".to_string(),
"async".to_string(),
"tokio".to_string(),
"beginner".to_string(),
"tutorial".to_string()
])
);
Ok(())
}
}