use {
crate::{
browser::evaluate_json,
error::{LlmWebError, Result},
},
headless_chrome::{Tab, protocol::cdp::Page::CaptureScreenshotFormatOption},
std::sync::Arc,
};
/// Output representation requested from the preprocessing functions in this file.
///
/// `Html` is the default variant.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum Format {
/// HTML. `preprocess` first evaluates `CLEANUP_JS` in the tab and then reads
/// the page content; `preprocess_html` returns the input string unchanged.
#[default]
Html,
/// HTML exactly as obtained: the tab's content, or the input string, with no
/// cleanup step.
RawHtml,
/// Markdown produced by `htmd::convert`.
Markdown,
/// Plain text: `document.body.innerText` for a tab, or the extracted text
/// nodes for an inline HTML string.
Text,
/// Base64-encoded PNG screenshot. Only `preprocess` (which has a tab) can
/// produce it; `preprocess_html` returns an error for this variant.
Image,
}
/// Options bundle for a run: the preprocessing [`Format`] plus optional tuning
/// values.
///
/// NOTE(review): only `format` has a visible meaning in this file. The other
/// fields are not read anywhere in the visible source; judging by their names
/// they are presumably LLM request parameters — confirm against the caller.
#[derive(Debug, Clone, Default)]
pub struct RunOptions {
/// Representation the page should be converted to.
pub format: Format,
/// Presumably a system prompt; `None` when unset — TODO confirm.
pub system: Option<String>,
/// Presumably the sampling temperature; `None` when unset — TODO confirm.
pub temperature: Option<f64>,
/// Presumably the nucleus-sampling threshold; `None` when unset — TODO confirm.
pub top_p: Option<f64>,
/// Presumably the response token limit; `None` when unset — TODO confirm.
pub max_tokens: Option<u32>,
}
impl RunOptions {
    /// Builds options for the given `format`, leaving every other field at its
    /// `Default` value (`None` for all the optional fields).
    pub fn new(format: Format) -> Self {
        let defaults = Self::default();
        Self { format, ..defaults }
    }
}
/// Result of preprocessing a page or an HTML string.
#[derive(Debug, Clone)]
pub struct Preprocessed {
/// Where the content came from: the tab's URL for `preprocess`, or the fixed
/// placeholder `inline://html` for `preprocess_html`.
pub url: String,
/// The converted content. For `Format::Image` this is the base64 encoding of
/// the PNG screenshot; for every other format it is text in that format.
pub content: String,
/// The format that was requested and that `content` is encoded in.
pub format: Format,
}
impl Preprocessed {
    /// MIME type to use for image content.
    ///
    /// Always `image/png`, regardless of `self.format`; screenshots in this
    /// file are captured with the PNG format option.
    pub fn image_mime(&self) -> &'static str {
        const PNG_MIME: &str = "image/png";
        PNG_MIME
    }
}
/// JavaScript evaluated in the page by `preprocess` for `Format::Html`, before
/// the page content is read.
///
/// The script walks every element in the document and:
/// - removes the element outright when its lower-cased tag name is listed in
///   `elementsToRemove` (scripts, styles, media, form controls, page chrome
///   such as `nav`/`header`/`footer`, and `head`);
/// - otherwise removes each attribute whose name *starts with* an entry of
///   `attributesToRemove`. Matching is by prefix, so `on` covers every
///   attribute beginning with "on" and `src` also covers e.g. `srcset`.
///
/// It mutates the live DOM of the tab and evaluates to `true`.
const CLEANUP_JS: &str = r#"
(() => {
const elementsToRemove = ['script','style','noscript','iframe','svg','img','audio','video','canvas','map','source','dialog','menu','menuitem','track','object','embed','form','input','button','select','textarea','label','option','optgroup','aside','footer','header','nav','head'];
const attributesToRemove = ['style','src','alt','title','role','aria-','tabindex','on','data-'];
document.querySelectorAll('*').forEach((el) => {
if (elementsToRemove.includes(el.tagName.toLowerCase())) { el.remove(); return; }
Array.from(el.attributes).forEach((attr) => {
if (attributesToRemove.some((a) => attr.name.startsWith(a))) el.removeAttribute(attr.name);
});
});
return true;
})()
"#;
/// Converts an in-memory HTML string to the requested `format` without a
/// browser.
///
/// - `Format::Html` and `Format::RawHtml` both return `html` unchanged (unlike
///   `preprocess`, no cleanup script is applied here).
/// - `Format::Markdown` runs the input through `htmd::convert`.
/// - `Format::Text` returns the whitespace-normalised text of the document.
///
/// The returned `url` is always the placeholder `inline://html`.
///
/// # Errors
///
/// Returns `LlmWebError::Preprocess` when the Markdown conversion fails, or
/// when `format` is `Format::Image`, which needs a browser tab to screenshot.
pub fn preprocess_html(html: &str, format: Format) -> Result<Preprocessed> {
    let rendered: Result<String> = match format {
        Format::Image => Err(LlmWebError::Preprocess(
            "Format::Image requires a browser tab — use a *_on_tab method".into(),
        )),
        Format::Markdown => {
            htmd::convert(html).map_err(|e| LlmWebError::Preprocess(format!("htmd: {e}")))
        }
        Format::Text => Ok(extract_text_from_html(html)),
        Format::Html | Format::RawHtml => Ok(html.to_owned()),
    };

    rendered.map(|content| Preprocessed {
        url: String::from("inline://html"),
        content,
        format,
    })
}
/// Extracts the readable text of an HTML document as a single line.
///
/// The document is parsed with `scraper`; text is taken from `<body>` when one
/// is found, otherwise from the root element. Text nodes are concatenated in
/// document order and every run of whitespace is collapsed to one space, with
/// leading and trailing whitespace dropped.
///
/// Text that lives inside `<script>`, `<style>` or `<noscript>` is skipped.
/// Those nodes hold code, CSS or fallback markup rather than page text, and
/// the browser-backed path (`document.body.innerText`) does not report them
/// either, so skipping them keeps both `Format::Text` paths in agreement.
fn extract_text_from_html(html: &str) -> String {
    let doc = scraper::Html::parse_document(html);
    let body_sel = scraper::Selector::parse("body").ok();
    let root = body_sel
        .as_ref()
        .and_then(|sel| doc.select(sel).next())
        .unwrap_or_else(|| doc.root_element());

    let mut text = String::new();
    for node in root.descendants() {
        if let Some(chunk) = node.value().as_text() {
            // A text node is non-rendered when any enclosing element is one of
            // the raw-content tags; the parser lower-cases HTML tag names.
            let non_rendered = node.ancestors().any(|ancestor| {
                ancestor
                    .value()
                    .as_element()
                    .map_or(false, |el| matches!(el.name(), "script" | "style" | "noscript"))
            });
            if !non_rendered {
                text.push_str(chunk);
            }
        }
    }

    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
pub async fn preprocess(tab: &Arc<Tab>, format: Format) -> Result<Preprocessed> {
let url = tab.get_url();
let content = match format {
Format::Html => {
let _ = evaluate_json(tab, CLEANUP_JS)
.map_err(|e| LlmWebError::Preprocess(format!("cleanup: {e}")))?;
tab.get_content()
.map_err(|e| LlmWebError::Preprocess(format!("get_content: {e}")))?
}
Format::RawHtml => tab
.get_content()
.map_err(|e| LlmWebError::Preprocess(format!("get_content: {e}")))?,
Format::Markdown => {
let body_html = evaluate_json(tab, "document.body.innerHTML")
.map_err(|e| LlmWebError::Preprocess(format!("body innerHTML: {e}")))?;
let body_html = body_html
.as_str()
.ok_or_else(|| LlmWebError::Preprocess("innerHTML not a string".into()))?;
htmd::convert(body_html)
.map_err(|e| LlmWebError::Preprocess(format!("htmd: {e}")))?
}
Format::Text => {
let txt = evaluate_json(tab, "document.body.innerText")
.map_err(|e| LlmWebError::Preprocess(format!("body innerText: {e}")))?;
txt.as_str()
.ok_or_else(|| LlmWebError::Preprocess("innerText not a string".into()))?
.to_string()
}
Format::Image => {
let bytes = tab
.capture_screenshot(CaptureScreenshotFormatOption::Png, None, None, true)
.map_err(|e| LlmWebError::Preprocess(format!("screenshot: {e}")))?;
use base64::Engine;
base64::engine::general_purpose::STANDARD.encode(bytes)
}
};
Ok(Preprocessed { url, content, format })
}