mod args;
use anyhow::{Context, Result};
use clap::Parser;
use args::{Cli, Command, DelegateTarget, OutputFormat};
use stillo_core::{
document::{DelegationTarget, SpaDetection},
ContentExtractor, ExtractorConfig, MarkdownConfig, MarkdownSerializer,
};
use stillo_fetcher::{HttpConfig, HttpFetcher, SpaDelegationChain};
use stillo_renderer::{TuiBrowser, TuiResult};
use stillo_llm::{LlmProvider, CompletionConfig, prompts};
use stillo_mcp::McpServer;
use url::Url;
/// Program entry point.
///
/// Parses the command line, installs a `tracing` subscriber and dispatches to
/// the handler of the selected sub-command. Without a sub-command, a bare URL
/// opens the interactive browser; with neither, the help text is printed.
///
/// # Errors
///
/// Propagates any error returned by the selected handler, by the MCP server,
/// or by printing the help text.
#[tokio::main]
async fn main() -> Result<()> {
    let cli = Cli::parse();

    // A filter supplied through the environment takes precedence; otherwise
    // the `verbose` flag chooses between "debug" and "warn".
    let log_level = if cli.verbose { "debug" } else { "warn" };
    tracing_subscriber::fmt()
        .with_env_filter(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(log_level)),
        )
        // Log lines go to stderr, keeping them out of the content printed on stdout.
        .with_writer(std::io::stderr)
        .init();

    match cli.command {
        Some(Command::Dump { url, format, delegate, no_delegate }) => {
            // Options given on the sub-command override the global ones.
            let fmt = format.unwrap_or(cli.format);
            let del = delegate.or(cli.delegate);
            let no_del = no_delegate || cli.no_delegate;
            dump(&url, &fmt, cli.timeout, del.as_ref(), no_del).await?;
        }
        // The handlers below only need to borrow the global delegate setting,
        // so it is passed by reference instead of being cloned first.
        Some(Command::Browse { url }) => {
            browse(&url, cli.timeout, cli.delegate.as_ref(), cli.no_delegate).await?;
        }
        Some(Command::Qa { question, url }) => {
            qa(&question, &url, cli.timeout, cli.delegate.as_ref(), cli.no_delegate).await?;
        }
        Some(Command::Summarize { url }) => {
            summarize(&url, cli.timeout, cli.delegate.as_ref(), cli.no_delegate).await?;
        }
        Some(Command::Extract { fields, url, format }) => {
            let fmt = format.unwrap_or(cli.format);
            extract_fields(
                &fields,
                &url,
                &fmt,
                cli.timeout,
                cli.delegate.as_ref(),
                cli.no_delegate,
            )
            .await?;
        }
        Some(Command::Mcp) => {
            McpServer::new().run_stdio().await?;
        }
        None => {
            if let Some(url) = cli.url {
                browse(&url, cli.timeout, cli.delegate.as_ref(), cli.no_delegate).await?;
            } else {
                use clap::CommandFactory;
                Cli::command().print_help()?;
            }
        }
    }
    Ok(())
}
/// Runs the interactive TUI browser, starting at `start_url`.
///
/// Each iteration fetches the current page, extracts its content and hands it
/// to a [`TuiBrowser`]. The loop continues while the TUI asks to navigate to
/// another URL. It ends when the TUI reports `Quit`, or `Dump`, in which case
/// the current page is first printed to stdout as Markdown.
///
/// `timeout`, `delegate` and `no_delegate` are forwarded unchanged to
/// `fetch_raw` for every page.
///
/// # Errors
///
/// Returns an error if fetching a page, extracting its content, or running
/// the TUI fails.
async fn browse(
    start_url: &Url,
    timeout: u64,
    delegate: Option<&DelegateTarget>,
    no_delegate: bool,
) -> Result<()> {
    let mut current_url = start_url.clone();
    loop {
        let raw = fetch_raw(&current_url, timeout, delegate, no_delegate).await?;
        let extractor = ContentExtractor::new(ExtractorConfig::default());
        let content = extractor.extract(&raw).with_context(|| "failed to extract content")?;
        // The TUI receives its own copy; `content` is still needed below if
        // the user asks for a dump.
        let mut browser = TuiBrowser::new(content.clone());
        match browser.run()? {
            TuiResult::Navigate(next_url) => {
                current_url = next_url;
            }
            TuiResult::Dump => {
                let serializer = MarkdownSerializer::new(MarkdownConfig::default());
                let doc = serializer.serialize(&content);
                print!("{}", doc.content);
                break;
            }
            TuiResult::Quit => break,
        }
    }
    Ok(())
}
/// Downloads `url` and returns the HTML that should be fed to the extractor.
///
/// The page is fetched over plain HTTP with `timeout` used as
/// `HttpConfig::timeout_secs`. If the response is a frameset, it is replaced
/// by its richest frame when one can be fetched. Unless `no_delegate` is set,
/// SPA detection then runs: a static page is returned as is, anything else is
/// passed to `delegate_or_fallback`.
///
/// # Errors
///
/// Returns an error if the initial fetch or the SPA detection fails.
async fn fetch_raw(
    url: &Url,
    timeout: u64,
    delegate: Option<&DelegateTarget>,
    no_delegate: bool,
) -> Result<stillo_core::document::RawHtml> {
    let fetcher = HttpFetcher::new(HttpConfig {
        timeout_secs: timeout,
        ..Default::default()
    });
    let extractor = ContentExtractor::new(ExtractorConfig::default());

    tracing::debug!("fetching {}", url);
    let page = fetcher
        .fetch(url)
        .await
        .with_context(|| format!("failed to fetch {}", url))?;
    tracing::debug!("fetched {} bytes (status={})", page.bytes.len(), page.status);

    // For a frameset, prefer the best frame; keep the frameset page itself
    // when no frame could be used.
    let frames = extractor.detect_frames(&page);
    let page = if frames.is_empty() {
        page
    } else {
        tracing::debug!("frameset detected ({} frames), fetching frame contents", frames.len());
        match fetch_richest_frame(&fetcher, &extractor, frames).await {
            Some(frame) => frame,
            None => page,
        }
    };

    if no_delegate {
        return Ok(page);
    }

    let detection = extractor
        .detect_spa_for(&page)
        .context("SPA detection failed")?;
    // Static pages are done here; both SPA verdicts log their reason and then
    // share the delegation call below.
    match &detection {
        SpaDetection::Static => return Ok(page),
        SpaDetection::SuspectedSpa { text_length } => {
            tracing::warn!("SPA suspected (text_length={}), trying delegation", text_length);
        }
        SpaDetection::FrameworkDetected { framework } => {
            tracing::warn!("JS framework detected ({:?}), trying delegation", framework);
        }
    }
    delegate_or_fallback(url, page, delegate, timeout).await
}
/// Fetches each frame in `frames`, one after another, and returns the
/// highest-scoring one.
///
/// Frames that fail to download, or that are themselves framesets, are
/// skipped. The score is the body size in bytes, reduced by 100 000 when the
/// fetched frame's URL contains "menu", "nav" or "sidebar" (case-insensitive).
/// On equal scores the frame seen first is kept. Returns `None` when no frame
/// qualifies.
async fn fetch_richest_frame(
    fetcher: &HttpFetcher,
    extractor: &ContentExtractor,
    frames: Vec<Url>,
) -> Option<stillo_core::document::RawHtml> {
    const NAV_HINTS: [&str; 3] = ["menu", "nav", "sidebar"];
    const NAV_PENALTY: i64 = -100_000;

    let mut winner: Option<(stillo_core::document::RawHtml, i64)> = None;
    for frame_url in frames {
        let raw = match fetcher.fetch(&frame_url).await {
            Ok(raw) => raw,
            Err(_) => continue,
        };
        // Skip frames that are themselves framesets.
        if !extractor.detect_frames(&raw).is_empty() {
            continue;
        }

        let lowered = raw.url.as_str().to_lowercase();
        let looks_like_nav = NAV_HINTS.iter().any(|hint| lowered.contains(*hint));
        let penalty = if looks_like_nav { NAV_PENALTY } else { 0 };
        let score = raw.bytes.len() as i64 + penalty;

        // Strictly greater: an equal score never displaces the current winner.
        let beats_current = match &winner {
            Some((_, top_score)) => score > *top_score,
            None => true,
        };
        if beats_current {
            winner = Some((raw, score));
        }
    }
    winner.map(|(raw, _)| raw)
}
/// Fetches `url`, extracts its content and prints it to stdout in `format`.
///
/// * `Markdown`: the serialized Markdown document.
/// * `Plain`: a `# title` line, a blank line, then the body text.
/// * `Json`: a pretty-printed object with `url`, `title`, `byline`,
///   `body_text`, `links` and `metadata`.
///
/// # Errors
///
/// Returns an error if fetching, content extraction or JSON serialization
/// fails.
async fn dump(
    url: &Url,
    format: &OutputFormat,
    timeout: u64,
    delegate: Option<&DelegateTarget>,
    no_delegate: bool,
) -> Result<()> {
    let extractor = ContentExtractor::new(ExtractorConfig::default());
    let raw = fetch_raw(url, timeout, delegate, no_delegate).await?;
    let content = extractor
        .extract(&raw)
        .context("failed to extract content")?;

    match format {
        OutputFormat::Markdown => {
            let markdown = MarkdownSerializer::new(MarkdownConfig::default()).serialize(&content);
            print!("{}", markdown.content);
        }
        OutputFormat::Plain => {
            // Heading line, blank line, body — the body gets no trailing newline.
            print!("# {}\n\n{}", content.title, content.body_text);
        }
        OutputFormat::Json => {
            let links = content
                .links
                .iter()
                .map(|link| {
                    serde_json::json!({
                        "text": link.text,
                        "href": link.href.as_str(),
                        "rel": link.rel,
                    })
                })
                .collect::<Vec<_>>();
            let canonical = content.metadata.canonical.as_ref().map(|u| u.as_str());
            let payload = serde_json::json!({
                "url": content.url.as_str(),
                "title": content.title,
                "byline": content.byline,
                "body_text": content.body_text,
                "links": links,
                "metadata": {
                    "description": content.metadata.description,
                    "og_title": content.metadata.og_title,
                    "canonical": canonical,
                },
            });
            let rendered = serde_json::to_string_pretty(&payload)?;
            println!("{}", rendered);
        }
    }
    Ok(())
}
/// Answers `question` about the page at `url` using the configured LLM and
/// prints the answer to stdout.
///
/// The page is fetched and converted to Markdown before the LLM provider is
/// looked up in the environment.
///
/// # Errors
///
/// Returns an error if the page cannot be fetched or extracted, if no LLM
/// provider is configured, or if the completion request fails.
async fn qa(
    question: &str,
    url: &Url,
    timeout: u64,
    delegate: Option<&DelegateTarget>,
    no_delegate: bool,
) -> Result<()> {
    let page = fetch_as_markdown(url, timeout, delegate, no_delegate).await?;
    let provider = LlmProvider::from_env().context("LLM provider not configured")?;

    let completion_config = CompletionConfig::default();
    let answer = provider
        .complete(prompts::qa_prompt(question, &page), &completion_config)
        .await
        .context("LLM request failed")?;

    println!("{answer}");
    Ok(())
}
/// Summarizes the page at `url` using the configured LLM and prints the
/// summary to stdout.
///
/// The page is fetched and converted to Markdown before the LLM provider is
/// looked up in the environment.
///
/// # Errors
///
/// Returns an error if the page cannot be fetched or extracted, if no LLM
/// provider is configured, or if the completion request fails.
async fn summarize(
    url: &Url,
    timeout: u64,
    delegate: Option<&DelegateTarget>,
    no_delegate: bool,
) -> Result<()> {
    let page = fetch_as_markdown(url, timeout, delegate, no_delegate).await?;
    let provider = LlmProvider::from_env().context("LLM provider not configured")?;

    let completion_config = CompletionConfig::default();
    let summary = provider
        .complete(prompts::summarize_prompt(&page), &completion_config)
        .await
        .context("LLM request failed")?;

    println!("{summary}");
    Ok(())
}
/// Asks the configured LLM to extract `fields` from the page at `url` and
/// prints the reply to stdout.
///
/// The request uses a temperature of `0.0`. When `format` is `Json` and the
/// reply parses as JSON, it is pretty-printed; in every other case the reply
/// is printed verbatim.
///
/// # Errors
///
/// Returns an error if the page cannot be fetched or extracted, if no LLM
/// provider is configured, if the completion request fails, or if
/// re-serializing the parsed JSON fails.
async fn extract_fields(
    fields: &str,
    url: &Url,
    format: &OutputFormat,
    timeout: u64,
    delegate: Option<&DelegateTarget>,
    no_delegate: bool,
) -> Result<()> {
    let page = fetch_as_markdown(url, timeout, delegate, no_delegate).await?;
    let provider = LlmProvider::from_env().context("LLM provider not configured")?;

    let completion_config = CompletionConfig { temperature: 0.0, ..Default::default() };
    let reply = provider
        .complete(prompts::extract_prompt(fields, &page), &completion_config)
        .await
        .context("LLM request failed")?;

    // Only attempt to parse the reply when JSON output was requested; a reply
    // that is not valid JSON falls through to verbatim printing.
    let parsed = if matches!(format, OutputFormat::Json) {
        serde_json::from_str::<serde_json::Value>(&reply).ok()
    } else {
        None
    };

    match parsed {
        Some(value) => println!("{}", serde_json::to_string_pretty(&value)?),
        None => println!("{}", reply),
    }
    Ok(())
}
/// Fetches `url`, extracts its content and returns it serialized as a
/// Markdown document, using the default extractor and Markdown configuration.
///
/// # Errors
///
/// Returns an error if fetching or content extraction fails.
async fn fetch_as_markdown(
    url: &Url,
    timeout: u64,
    delegate: Option<&DelegateTarget>,
    no_delegate: bool,
) -> Result<stillo_core::document::MarkdownDocument> {
    let extractor = ContentExtractor::new(ExtractorConfig::default());
    let html = fetch_raw(url, timeout, delegate, no_delegate).await?;
    let page_content = extractor
        .extract(&html)
        .context("failed to extract content")?;

    let markdown = MarkdownSerializer::new(MarkdownConfig::default()).serialize(&page_content);
    Ok(markdown)
}
/// Tries to obtain `url` through the JS-capable delegation chain selected by
/// `delegate`, returning `static_raw` when that attempt fails.
///
/// A delegation failure is logged as a warning and never surfaces as an
/// error, so this function always returns `Ok`. `_timeout` is accepted but
/// currently unused.
async fn delegate_or_fallback(
    url: &Url,
    static_raw: stillo_core::document::RawHtml,
    delegate: Option<&DelegateTarget>,
    _timeout: u64,
) -> Result<stillo_core::document::RawHtml> {
    let chain = build_delegation_chain(delegate);
    let html = chain.fetch_with_js(url).await.unwrap_or_else(|err| {
        tracing::warn!(
            "all delegation targets failed ({}), falling back to static HTML",
            err
        );
        static_raw
    });
    Ok(html)
}
fn build_delegation_chain(delegate: Option<&DelegateTarget>) -> SpaDelegationChain {
match delegate {
None | Some(DelegateTarget::Auto) => SpaDelegationChain::from_env(9222),
Some(DelegateTarget::Cdp) => SpaDelegationChain::with_single_target(
DelegationTarget::LocalCdp { port: 9222 },
),
Some(DelegateTarget::Playwright) => SpaDelegationChain::with_single_target(
DelegationTarget::PlaywrightDaemon {
socket_path: "/tmp/stillo-playwright.sock".into(),
},
),
Some(DelegateTarget::Jina) => SpaDelegationChain::with_single_target(
DelegationTarget::JinaReader {
api_key: std::env::var("JINA_API_KEY").ok(),
},
),
Some(DelegateTarget::Firecrawl) => {
let base_url = std::env::var("FIRECRAWL_URL")
.ok()
.and_then(|u| u.parse().ok())
.unwrap_or_else(|| "https://api.firecrawl.dev/".parse().unwrap());
let api_key = std::env::var("FIRECRAWL_API_KEY").unwrap_or_default();
SpaDelegationChain::with_single_target(DelegationTarget::Firecrawl {
base_url,
api_key,
})
}
}
}