#[cfg(feature = "rendered")]
pub(crate) mod dynamic_fetch;
pub(crate) mod static_fetch;
use crate::error::Result;
#[cfg(not(feature = "rendered"))]
use crate::error::Web2llmError;
use url::Url;
#[cfg(feature = "rendered")]
use tokio::sync::OnceCell;
/// Selects how `get_html` retrieves a page.
///
/// The default is [`FetchMode::Auto`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum FetchMode {
    /// Fetch through `static_fetch` only; the result is reported as not rendered.
    Static,
    /// Fetch through `dynamic_fetch`. Builds without the `rendered` feature
    /// return a configuration error for this mode instead.
    Dynamic,
    /// Fetch statically first, then let `is_spa` decide whether the page should be
    /// fetched again through `dynamic_fetch` (only possible with the `rendered` feature).
    #[default]
    Auto,
}
/// Fetches the HTML for `url` using the strategy selected by `mode`.
///
/// Returns the document together with a flag that is `true` when the HTML was
/// produced by `dynamic_fetch` and `false` when it came from `static_fetch`.
///
/// * `FetchMode::Static` always uses `static_fetch`.
/// * `FetchMode::Dynamic` uses `dynamic_fetch`, which is only compiled in with the
///   `rendered` feature.
/// * `FetchMode::Auto` fetches statically first. When the `rendered` feature is
///   enabled and `is_spa` flags the document, the page is fetched again through
///   `dynamic_fetch`; otherwise the static HTML is returned unchanged.
///
/// # Errors
///
/// Propagates any error returned by the underlying fetchers. In builds without the
/// `rendered` feature, `FetchMode::Dynamic` yields `Web2llmError::Config`.
pub(crate) async fn get_html(
    url: &Url,
    client: &reqwest::Client,
    mode: FetchMode,
    #[cfg(feature = "rendered")] browser: &OnceCell<chromiumoxide::Browser>,
) -> Result<(String, bool)> {
    match mode {
        FetchMode::Static => {
            let html = static_fetch::get_html(url, client).await?;
            Ok((html, false))
        }
        FetchMode::Dynamic => {
            #[cfg(feature = "rendered")]
            {
                let html = dynamic_fetch::get_html(url, browser).await?;
                Ok((html, true))
            }
            #[cfg(not(feature = "rendered"))]
            {
                Err(Web2llmError::Config(
                    "Feature 'rendered' is required for dynamic fetching".to_string(),
                ))
            }
        }
        FetchMode::Auto => {
            let html = static_fetch::get_html(url, client).await?;
            // `cfg!` expands to a compile-time boolean, so builds without the
            // `rendered` feature short-circuit here and never pay for the SPA
            // heuristic (it lowercases the whole document) whose answer they
            // could not act on anyway.
            if cfg!(feature = "rendered") && is_spa(&html) {
                #[cfg(feature = "rendered")]
                {
                    let rendered = dynamic_fetch::get_html(url, browser).await?;
                    return Ok((rendered, true));
                }
            }
            Ok((html, false))
        }
    }
}
/// Heuristically decides whether `html` looks like a JavaScript-driven page.
///
/// Matching is done on a lowercased copy of the document, so markers are found
/// regardless of case. Size limits are compared against the byte length of the
/// original input. The checks, in order:
///
/// 1. A `<noscript` tag together with the text `javascript` or `enable js`, or any
///    of the framework markers listed in `FRAMEWORK_MARKERS`, is enough on its own.
/// 2. A known root container (`id="app"`, `id="root"`, `<app-root`, ...) counts only
///    when the document is smaller than 15 KiB.
/// 3. A reference to a typical script bundle (`bundle.js`, `_next/static`, ...) counts
///    only when the document is smaller than 20 KiB.
///
/// Returns `false` when none of the checks match.
pub fn is_spa(html: &str) -> bool {
    /// Documents with a root container must be smaller than this many bytes.
    const ROOT_CONTAINER_MAX_BYTES: usize = 15 * 1024;
    /// Documents with a bundle reference must be smaller than this many bytes.
    const BUNDLE_HINT_MAX_BYTES: usize = 20 * 1024;

    /// Markers that classify the page on their own, whatever its size.
    const FRAMEWORK_MARKERS: &[&str] = &[
        "ng-version=",
        "data-reactroot",
        "data-server-rendered",
        "name=\"fragment\" content=\"!\"",
        "window.__initial_state__",
        "window.__next_data__",
    ];
    /// Mount points looked for in small documents.
    const ROOT_CONTAINERS: &[&str] = &[
        "id=\"app\"",
        "id=\"root\"",
        "id=\"__next\"",
        "id=\"__nuxt\"",
        "id=\"___gatsby\"",
        "id=\"app-root\"",
        "<app-root",
        "id=\"ember-application\"",
    ];
    /// Script bundle names looked for in small documents.
    const BUNDLE_HINTS: &[&str] = &[".chunk.js", "bundle.js", "vendor.js", "_next/static"];

    /// Reports whether `haystack` contains at least one of `needles`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|&needle| haystack.contains(needle))
    }

    let lowered = html.to_lowercase();

    let noscript_js_notice = lowered.contains("<noscript")
        && (lowered.contains("javascript") || lowered.contains("enable js"));
    if noscript_js_notice || contains_any(&lowered, FRAMEWORK_MARKERS) {
        return true;
    }

    let byte_len = html.len();
    if byte_len < ROOT_CONTAINER_MAX_BYTES && contains_any(&lowered, ROOT_CONTAINERS) {
        return true;
    }

    byte_len < BUNDLE_HINT_MAX_BYTES && contains_any(&lowered, BUNDLE_HINTS)
}