use crate::{PreviewError, Preview};
use crate::MetadataExtractor;
use crate::mcp_client::{McpClient, McpConfig, BrowserUsagePolicy};
use std::sync::Arc;
use url::Url;
#[cfg(feature = "logging")]
use tracing::{debug, warn, instrument};
/// Fetches pages through a headless browser (driven by the MCP client) for
/// URLs whose content likely requires JavaScript rendering.
pub struct BrowserFetcher {
    // Shared handle to the browser-side MCP client used for navigation,
    // HTML retrieval, screenshots, and script evaluation.
    mcp_client: Arc<McpClient>,
    // Policy controlling when the browser is used (Always / Never / Auto).
    usage_policy: BrowserUsagePolicy,
    // Extracts preview metadata from fetched HTML.
    metadata_extractor: Arc<MetadataExtractor>,
}
impl BrowserFetcher {
    /// Creates a fetcher with the given MCP configuration and usage policy.
    pub fn new(config: McpConfig, usage_policy: BrowserUsagePolicy) -> Self {
        Self {
            mcp_client: Arc::new(McpClient::new(config)),
            usage_policy,
            metadata_extractor: Arc::new(MetadataExtractor::new()),
        }
    }

    /// Starts the underlying MCP client. Must succeed before any
    /// browser-backed fetch is attempted.
    pub async fn initialize(&self) -> Result<(), PreviewError> {
        self.mcp_client.start().await
    }

    /// Stops the underlying MCP client.
    pub async fn shutdown(&self) -> Result<(), PreviewError> {
        self.mcp_client.stop().await
    }

    /// Returns whether `url` should be fetched with a real browser according
    /// to the configured usage policy.
    pub fn should_use_browser(&self, url: &str) -> bool {
        match self.usage_policy {
            BrowserUsagePolicy::Always => true,
            BrowserUsagePolicy::Never => false,
            BrowserUsagePolicy::Auto => self.detect_browser_need(url),
        }
    }

    /// Heuristically decides whether a URL likely needs JavaScript rendering:
    /// known SPA-heavy domains, hash-based SPA routing in the fragment, or
    /// app-shell path segments. Unparseable URLs are treated as not needing
    /// a browser.
    fn detect_browser_need(&self, url: &str) -> bool {
        let parsed = match Url::parse(url) {
            Ok(u) => u,
            Err(_) => return false,
        };
        let domain = parsed.host_str().unwrap_or("");

        // Domains known to require JavaScript for meaningful content.
        const SPA_DOMAINS: &[&str] = &[
            "twitter.com",
            "x.com",
            "instagram.com",
            "facebook.com",
            "linkedin.com",
            "reddit.com",
            "discord.com",
            "slack.com",
            "notion.so",
            "vercel.app",
            "netlify.app",
            "web.app",
        ];
        // Match the domain itself or any subdomain of it. A bare `ends_with`
        // would wrongly match unrelated hosts ("notx.com" ends with "x.com",
        // "someweb.app" ends with "web.app"), so require an exact match or a
        // '.' label boundary.
        if SPA_DOMAINS
            .iter()
            .any(|d| domain == *d || domain.ends_with(&format!(".{}", d)))
        {
            return true;
        }

        // Hash-based SPA routing ("#/..." or "#!/...") lives in the URL
        // fragment, which `Url::path()` does not include, so it must be
        // checked separately here.
        if let Some(fragment) = parsed.fragment() {
            if fragment.starts_with('/') || fragment.starts_with("!/") {
                return true;
            }
        }

        // Path segments that commonly indicate client-rendered app shells.
        let path = parsed.path();
        ["/app/", "/dashboard/"]
            .iter()
            .any(|indicator| path.contains(indicator))
    }

    /// Navigates the browser to `url` and returns the rendered page HTML.
    #[cfg_attr(feature = "logging", instrument(skip(self)))]
    pub async fn fetch_with_browser(&self, url: &str) -> Result<String, PreviewError> {
        #[cfg(feature = "logging")]
        debug!("Fetching content with browser for URL: {}", url);
        self.mcp_client.navigate(url).await?;
        // Fixed grace period so client-side rendering can settle.
        // NOTE(review): presumably a plain load event fires too early for
        // SPAs; consider wait_for_load() plus a shorter settle delay — confirm
        // against the MCP client's semantics.
        tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
        let html = self.mcp_client.get_page_html().await?;
        #[cfg(feature = "logging")]
        debug!("Successfully fetched {} bytes of HTML", html.len());
        Ok(html)
    }

    /// Fetches `url` with the browser and extracts preview metadata from the
    /// rendered HTML.
    #[cfg_attr(feature = "logging", instrument(skip(self)))]
    pub async fn generate_preview(&self, url: &str) -> Result<Preview, PreviewError> {
        let html = self.fetch_with_browser(url).await?;
        self.metadata_extractor.extract(&html, url)
    }

    /// Navigates to `url`, waits for the page to load, and captures a
    /// screenshot as raw bytes.
    #[cfg_attr(feature = "logging", instrument(skip(self)))]
    pub async fn take_screenshot(&self, url: &str) -> Result<Vec<u8>, PreviewError> {
        #[cfg(feature = "logging")]
        debug!("Taking screenshot of URL: {}", url);
        self.mcp_client.navigate(url).await?;
        self.mcp_client.wait_for_load().await?;
        self.mcp_client.take_screenshot().await
    }

    /// Navigates to `url`, evaluates `script` in the page, and deserializes
    /// the JSON result into `T`.
    ///
    /// # Errors
    /// Returns `PreviewError::ParseError` when the script result does not
    /// deserialize into `T`; propagates navigation/evaluation errors.
    #[cfg_attr(feature = "logging", instrument(skip(self, script)))]
    pub async fn extract_with_script<T>(&self, url: &str, script: &str) -> Result<T, PreviewError>
    where
        T: serde::de::DeserializeOwned,
    {
        self.mcp_client.navigate(url).await?;
        self.mcp_client.wait_for_load().await?;
        let result = self.mcp_client.evaluate(script).await?;
        serde_json::from_value(result)
            .map_err(|e| PreviewError::ParseError(format!("Failed to parse extraction result: {}", e)))
    }
}
/// Preview service that routes URLs either through a browser-backed fetcher
/// or a plain HTTP fetcher, based on the configured usage policy.
pub struct BrowserPreviewService {
    // Browser-backed fetcher, used when the policy selects the browser.
    browser_fetcher: Arc<BrowserFetcher>,
    // Plain HTTP fetcher, used directly or as fallback when the browser fails.
    fallback_fetcher: Arc<crate::Fetcher>,
    // Extracts preview metadata from HTML obtained via the fallback path.
    metadata_extractor: Arc<MetadataExtractor>,
}
impl BrowserPreviewService {
    /// Creates a preview service with a browser-backed fetcher and a plain
    /// HTTP fetcher used as fallback.
    pub fn new(mcp_config: McpConfig, usage_policy: BrowserUsagePolicy) -> Self {
        Self {
            browser_fetcher: Arc::new(BrowserFetcher::new(mcp_config, usage_policy)),
            fallback_fetcher: Arc::new(crate::Fetcher::new()),
            metadata_extractor: Arc::new(MetadataExtractor::new()),
        }
    }

    /// Starts the browser client; call before generating previews that may
    /// use the browser.
    pub async fn initialize(&self) -> Result<(), PreviewError> {
        self.browser_fetcher.initialize().await
    }

    /// Returns whether the configured policy selects the browser for `url`.
    pub fn should_use_browser(&self, url: &str) -> bool {
        self.browser_fetcher.should_use_browser(url)
    }

    /// Generates a preview for `url`, using the browser when the policy calls
    /// for it and falling back to a standard HTTP fetch if the browser fails.
    #[cfg_attr(feature = "logging", instrument(skip(self)))]
    pub async fn generate_preview(&self, url: &str) -> Result<Preview, PreviewError> {
        if self.browser_fetcher.should_use_browser(url) {
            #[cfg(feature = "logging")]
            debug!("Using browser for URL: {}", url);
            match self.browser_fetcher.generate_preview(url).await {
                Ok(preview) => Ok(preview),
                Err(_e) => {
                    #[cfg(feature = "logging")]
                    warn!("Browser fetch failed, falling back to standard fetch: {}", _e);
                    self.standard_preview(url).await
                }
            }
        } else {
            #[cfg(feature = "logging")]
            debug!("Using standard fetch for URL: {}", url);
            self.standard_preview(url).await
        }
    }

    /// Fetches `url` with the plain HTTP fetcher and extracts preview
    /// metadata. Shared by the non-browser path and the browser-failure
    /// fallback (previously duplicated inline).
    ///
    /// # Errors
    /// Returns `PreviewError::InvalidContentType` when the response is not HTML.
    async fn standard_preview(&self, url: &str) -> Result<Preview, PreviewError> {
        let fetch_result = self.fallback_fetcher.fetch(url).await?;
        let html = match fetch_result {
            crate::FetchResult::Html(h) => h,
            _ => return Err(PreviewError::InvalidContentType("Expected HTML".to_string())),
        };
        self.metadata_extractor.extract(&html, url)
    }
}
impl Drop for BrowserPreviewService {
    /// Best-effort async shutdown of the browser client on drop.
    fn drop(&mut self) {
        // `tokio::spawn` panics when called outside a runtime context, and
        // Drop may run on a non-runtime thread or after the runtime has shut
        // down. Only spawn the shutdown task if a runtime handle is available;
        // otherwise skip — shutdown here is best-effort anyway.
        if let Ok(handle) = tokio::runtime::Handle::try_current() {
            let fetcher = self.browser_fetcher.clone();
            handle.spawn(async move {
                let _ = fetcher.shutdown().await;
            });
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fetcher with default config and the given policy.
    fn fetcher_with(policy: BrowserUsagePolicy) -> BrowserFetcher {
        BrowserFetcher::new(McpConfig::default(), policy)
    }

    #[test]
    fn test_browser_detection() {
        let fetcher = fetcher_with(BrowserUsagePolicy::Auto);

        // URLs that should be routed through the browser.
        let needs_browser = [
            "https://twitter.com/home",
            "https://app.netlify.app",
            "https://example.com/#!/page",
            "https://app.example.com/dashboard/",
        ];
        for url in needs_browser {
            assert!(fetcher.detect_browser_need(url), "expected browser for {}", url);
        }

        // URLs that a plain fetch should handle.
        let plain = ["https://example.com", "https://blog.example.com/post"];
        for url in plain {
            assert!(!fetcher.detect_browser_need(url), "expected no browser for {}", url);
        }
    }

    #[test]
    fn test_usage_policy() {
        let always = fetcher_with(BrowserUsagePolicy::Always);
        assert!(always.should_use_browser("https://example.com"));

        let never = fetcher_with(BrowserUsagePolicy::Never);
        assert!(!never.should_use_browser("https://twitter.com"));

        let auto = fetcher_with(BrowserUsagePolicy::Auto);
        assert!(auto.should_use_browser("https://twitter.com"));
        assert!(!auto.should_use_browser("https://example.com"));
    }
}