pub mod config;
mod crawl;
pub mod error;
pub(crate) mod extract;
pub(crate) mod fetch;
pub mod output;
pub(crate) mod preflight;
pub(crate) mod tokens;
pub use config::Web2llmConfig;
pub use crawl::CrawlConfig;
pub use error::Web2llmError;
pub use fetch::FetchMode;
pub use output::PageResult;
use std::collections::{HashSet, VecDeque};
use std::num::NonZeroU32;
use std::sync::Arc;
use crate::error::Result;
use crate::extract::PageElements;
use futures::stream::StreamExt;
use governor::{DefaultDirectRateLimiter, Quota, RateLimiter};
#[cfg(feature = "rendered")]
use tokio::sync::OnceCell;
use tokio::sync::Semaphore;
/// Scraping engine holding shared HTTP, rate-limiting and concurrency state.
///
/// Cloning is cheap: every heavyweight member is behind an `Arc` (or is an
/// internally reference-counted `reqwest::Client`), so clones share the same
/// rate limiter, semaphore and (when enabled) browser instance.
#[derive(Clone)]
pub struct Web2llm {
    // Validated configuration; see `Web2llm::new` / `validate_config`.
    config: Web2llmConfig,
    // Shared HTTP client configured with the timeout/user-agent from `config`.
    client: reqwest::Client,
    // Global requests-per-second limiter (quota taken from `config.rate_limit`).
    limiter: Arc<DefaultDirectRateLimiter>,
    // Bounds in-flight fetches to `config.max_concurrency` permits.
    semaphore: Arc<Semaphore>,
    // Lazily-initialized headless browser, shared across clones.
    // NOTE(review): initialization site is not visible in this file — presumably
    // done on first rendered fetch inside `PageElements::parse`; confirm there.
    #[cfg(feature = "rendered")]
    browser: Arc<OnceCell<chromiumoxide::Browser>>,
}
impl Web2llm {
pub fn new(config: Web2llmConfig) -> Result<Self> {
Self::validate_config(&config)?;
let client = reqwest::Client::builder()
.timeout(config.timeout)
.user_agent(&config.user_agent)
.build()
.unwrap_or_else(|_| reqwest::Client::new());
let limiter = Arc::new(RateLimiter::direct(Quota::per_second(
NonZeroU32::new(config.rate_limit).unwrap(),
)));
let semaphore = Arc::new(Semaphore::new(config.max_concurrency));
#[cfg(feature = "rendered")]
let browser = Arc::new(OnceCell::new());
Ok(Self {
config,
client,
limiter,
semaphore,
#[cfg(feature = "rendered")]
browser,
})
}
fn validate_config(config: &Web2llmConfig) -> Result<()> {
if config.rate_limit == 0 {
return Err(Web2llmError::Config(
"rate_limit must be greater than zero".to_string(),
));
}
if config.max_concurrency == 0 {
return Err(Web2llmError::Config(
"max_concurrency must be greater than zero".to_string(),
));
}
if config.sensitivity < 0.0 || config.sensitivity > 1.0 {
return Err(Web2llmError::Config(
"sensitivity must be between 0.0 and 1.0".to_string(),
));
}
Ok(())
}
pub async fn get_urls(&self, url: &str) -> Result<Vec<String>> {
let url = preflight::run_sync(url, self.config.block_private_hosts)?;
if self.config.robots_check {
preflight::robots::check_single(&url, &self.config.user_agent, &self.client).await?;
}
#[cfg(feature = "rendered")]
let elements = PageElements::parse(
url.clone(),
&self.client,
self.config.fetch_mode,
&self.browser,
)
.await?;
#[cfg(not(feature = "rendered"))]
let elements =
PageElements::parse(url.clone(), &self.client, self.config.fetch_mode).await?;
Ok(elements.get_urls())
}
#[inline(always)]
pub async fn fetch(&self, url: &str) -> Result<PageResult> {
let url = preflight::run_sync(url, self.config.block_private_hosts)?;
if self.config.robots_check {
preflight::robots::check_single(&url, &self.config.user_agent, &self.client).await?;
}
#[cfg(feature = "rendered")]
let elements =
PageElements::parse(url, &self.client, self.config.fetch_mode, &self.browser).await?;
#[cfg(not(feature = "rendered"))]
let elements = PageElements::parse(url, &self.client, self.config.fetch_mode).await?;
elements.into_result(&self.config)
}
pub async fn batch_fetch(&self, urls: Vec<String>) -> Vec<(String, Result<PageResult>)> {
let preflight_results = preflight::run_batch(
urls,
&self.config.user_agent,
self.config.block_private_hosts,
self.config.robots_check,
&self.client,
)
.await;
let mut final_results = Vec::with_capacity(preflight_results.len());
let mut to_fetch = Vec::new();
for (raw, res) in preflight_results {
match res {
Ok(url) => to_fetch.push((raw, url)),
Err(e) => final_results.push((raw, Err(e))),
}
}
let stream = futures::stream::iter(to_fetch).map(|(raw, url)| {
let engine = self.clone();
tokio::spawn(async move {
let res = async {
let _permit = engine.semaphore.acquire().await.map_err(|e| {
Web2llmError::Config(format!("Failed to acquire concurrency permit: {}", e))
})?;
engine.limiter.until_ready().await;
#[cfg(feature = "rendered")]
let elements = PageElements::parse(
url.clone(),
&engine.client,
engine.config.fetch_mode,
&engine.browser,
)
.await?;
#[cfg(not(feature = "rendered"))]
let elements =
PageElements::parse(url.clone(), &engine.client, engine.config.fetch_mode)
.await?;
elements.into_result(&engine.config)
}
.await;
(raw, res)
})
});
let mut fetched_results: Vec<(String, Result<PageResult>)> = if self.config.ordered {
stream
.buffered(self.config.max_concurrency)
.map(|res| res.expect("Task panicked during batch fetch"))
.collect()
.await
} else {
stream
.buffer_unordered(self.config.max_concurrency)
.map(|res| res.expect("Task panicked during batch fetch"))
.collect()
.await
};
final_results.append(&mut fetched_results);
final_results
}
pub async fn crawl(
&self,
url: &str,
crawl_config: CrawlConfig,
) -> Vec<(String, Result<PageResult>)> {
let seed = match crawl::normalize_url(url) {
Some(url) => url,
None => {
return vec![(
url.to_string(),
Err(Web2llmError::InvalidUrl(url.to_string())),
)];
}
};
let Some(seed_host) = seed.host_str().map(str::to_string) else {
return vec![(
url.to_string(),
Err(Web2llmError::InvalidUrl("URL has no host".to_string())),
)];
};
let seed_port = seed.port_or_known_default();
let seed_url = seed.to_string();
let mut discovered = Vec::from([seed_url.clone()]);
let mut visited = HashSet::from([seed_url.clone()]);
let mut frontier = VecDeque::from([seed_url]);
let mut depth = 0;
while depth < crawl_config.max_depth && !frontier.is_empty() {
let level_size = frontier.len();
let mut current_level = Vec::with_capacity(level_size);
for _ in 0..level_size {
if let Some(next_url) = frontier.pop_front() {
current_level.push(next_url);
}
}
for current_url in current_level {
let links = match self.get_urls(¤t_url).await {
Ok(links) => links,
Err(_) => continue,
};
for link in links {
let Some(normalized) = crawl::normalize_url(&link) else {
continue;
};
if !crawl::should_follow(&normalized, &seed_host, seed_port, &crawl_config) {
continue;
}
let normalized = normalized.to_string();
if visited.insert(normalized.clone()) {
discovered.push(normalized.clone());
frontier.push_back(normalized);
}
}
}
depth += 1;
}
self.batch_fetch(discovered).await
}
}
/// Convenience wrapper: fetches a single page with a default-configured
/// engine. See [`Web2llm::fetch`] for details and error conditions.
pub async fn fetch(url: String) -> Result<PageResult> {
    let engine = Web2llm::new(Web2llmConfig::default())?;
    engine.fetch(&url).await
}
/// Convenience wrapper: fetches many pages with a default-configured engine.
/// The outer `Result` covers engine construction; per-URL outcomes are inner.
pub async fn batch_fetch(urls: Vec<String>) -> Result<Vec<(String, Result<PageResult>)>> {
    let engine = Web2llm::new(Web2llmConfig::default())?;
    let results = engine.batch_fetch(urls).await;
    Ok(results)
}
/// Convenience wrapper: crawls from `url` with a default-configured engine.
/// The outer `Result` covers engine construction; per-page outcomes are inner.
pub async fn crawl(
    url: String,
    crawl_config: CrawlConfig,
) -> Result<Vec<(String, Result<PageResult>)>> {
    let engine = Web2llm::new(Web2llmConfig::default())?;
    let results = engine.crawl(&url, crawl_config).await;
    Ok(results)
}