1use anyhow::Context;
9use jiff::Timestamp;
10use std::path::Path;
11use url::Url;
12
13use crate::config;
14use crate::extractor::frontmatter::{PageMeta, render};
15use crate::extractor::pipeline::extract;
16use crate::fetcher::cached::{
17 CacheStatus, ExtractResult, FetchOptions, fetch_with_cache, sha256_hex,
18};
19use crate::fetcher::client::build_http_client;
20use crate::fetcher::ssrf::SsrfLevel;
21use crate::storage::Db;
22
/// Command-line arguments consumed by [`run`].
pub struct Args {
    // --- what to fetch -----------------------------------------------------
    /// URL to fetch. `run` parses it with `Url::parse` and fails if it is
    /// not a valid URL.
    pub url: String,
    /// Forwarded to the fetcher as `FetchOptions::force_refresh`.
    pub force_refresh: bool,
    /// Passed both to the config override step and to the fetcher as
    /// `FetchOptions::ignore_robots`.
    pub ignore_robots: bool,

    // --- HTTP client overrides ---------------------------------------------
    /// When set, replaces the configured `fetch.user_agent`.
    pub user_agent: Option<String>,

    /// When set, replaces the configured `fetch.timeout_secs`.
    /// A value of `0` is rejected by `run`.
    pub timeout_secs: Option<u64>,

    // --- pacing overrides (handed to `Config::apply_overrides`) ------------
    /// Rate-limit override, handed to `apply_overrides` unchanged.
    pub rate_limit_rpm: Option<u32>,
    /// Per-host concurrency override, handed to `apply_overrides` unchanged.
    pub per_host_concurrency: Option<u32>,
    /// Global concurrency override, handed to `apply_overrides` unchanged.
    pub global_concurrency: Option<u32>,
    /// Retry-count override, handed to `apply_overrides` unchanged.
    pub max_retries: Option<u8>,

    // --- output shaping ----------------------------------------------------
    /// Token budget for the printed body. When the extracted markdown has
    /// more tokens than this (and no explicit `summarize` request was made)
    /// the body is summarized down towards the budget. `Some(0)` is rejected
    /// by `run`.
    pub max_tokens: Option<usize>,

    /// Explicit summarization request as a JSON document; `run` deserializes
    /// it into `crate::mcp::tools::fetch::InlineSummarizeArgs`.
    pub summarize: Option<String>,
}
51
52pub async fn run(args: Args, config_path: Option<&Path>) -> anyhow::Result<()> {
53 let mut cfg = config::load_resolved(config_path).context("loading config")?;
54 cfg.apply_overrides(
55 args.rate_limit_rpm,
56 args.per_host_concurrency,
57 args.global_concurrency,
58 args.max_retries,
59 args.ignore_robots,
60 );
61 if let Some(ua) = args.user_agent {
65 cfg.fetch.user_agent = ua;
66 }
67 if let Some(t) = args.timeout_secs {
68 if t == 0 {
69 anyhow::bail!("--timeout-secs must be greater than 0");
70 }
71 cfg.fetch.timeout_secs = t;
72 }
73 let url = Url::parse(&args.url).context("parsing URL argument")?;
74 let level = SsrfLevel::parse(&cfg.ssrf.level)
75 .with_context(|| format!("invalid [ssrf] level `{}` in config", cfg.ssrf.level))?;
76 let ssrf_project_root = if level == SsrfLevel::Project {
77 let raw = &cfg.ssrf.project_root;
78 let resolved = std::fs::canonicalize(raw)
79 .with_context(|| format!("canonicalizing ssrf.project_root `{}`", raw.display()))?;
80 tracing::info!(
81 target: "rover::ssrf",
82 project_root = %resolved.display(),
83 "ssrf level=project; project_root resolved",
84 );
85 Some(resolved)
86 } else {
87 None
88 };
89
90 let summarize_opts: Option<crate::mcp::tools::fetch::InlineSummarizeArgs> =
93 match args.summarize.as_deref() {
94 Some(s) => Some(serde_json::from_str(s).context("parsing --summarize JSON")?),
95 None => None,
96 };
97 if matches!(args.max_tokens, Some(0)) {
98 anyhow::bail!("--max-tokens must be greater than 0");
99 }
100
101 let data_dir = crate::paths::data_dir();
102 std::fs::create_dir_all(&data_dir).context("creating data dir")?;
103 let db = Db::open(data_dir.join("rover.db"))
104 .await
105 .context("opening cache database")?;
106
107 let client = build_http_client(&cfg.fetch.user_agent, cfg.fetch.timeout());
108 let pacer = crate::fetcher::concurrency::Pacer::new(&cfg.rate_limit);
109
110 let har_recorder: Option<std::sync::Arc<crate::fetcher::har::HarRecorder>> =
114 if !cfg.debug.har_path.is_empty() {
115 let path = std::path::PathBuf::from(&cfg.debug.har_path);
116 let r = crate::fetcher::har::HarRecorder::new(path, cfg.debug.har_body_cap)
117 .with_context(|| format!("opening har file at {}", cfg.debug.har_path))?;
118 Some(std::sync::Arc::new(r))
119 } else {
120 None
121 };
122
123 let headless_mode = if cfg.headless.auto_detect_spa {
129 crate::fetcher::HeadlessMode::Auto
130 } else {
131 crate::fetcher::HeadlessMode::Off
132 };
133 #[cfg(feature = "headless")]
139 let headless: Option<crate::fetcher::headless::HeadlessHandle> =
140 if !matches!(headless_mode, crate::fetcher::HeadlessMode::Off) {
141 Some(crate::fetcher::headless::HeadlessHandle::new(
142 cfg.headless.clone(),
143 ))
144 } else {
145 None
146 };
147
148 let result = fetch_with_cache(
149 &db,
150 &client,
151 &pacer,
152 &cfg.rate_limit,
153 &cfg.robots,
154 &url,
155 &cfg.cache,
156 FetchOptions {
157 force_refresh: args.force_refresh,
158 ssrf_level: level,
159 ssrf_project_root,
160 har_recorder: har_recorder.clone(),
161 ignore_robots: args.ignore_robots,
162 user_agent: cfg.fetch.user_agent.clone(),
163 #[cfg(feature = "headless")]
164 headless: headless.clone(),
165 headless_mode,
166 synchronous_revalidation: true,
172 },
173 |body, base| {
174 let extracted =
175 extract(body, Some(base)).map_err(crate::fetcher::FetcherError::Extract)?;
176 let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
177 Ok(ExtractResult {
178 title: extracted.title,
179 body_md: extracted.body_md,
180 content_hash,
181 metadata: extracted.metadata,
182 })
183 },
184 )
185 .await;
186
187 #[cfg(feature = "headless")]
194 if let Some(h) = headless {
195 h.shutdown().await;
196 }
197
198 let result = result.context("fetching URL")?;
199
200 if matches!(result.cache_status, CacheStatus::Stale { .. }) {
201 tracing::warn!(
202 target: "rover::cli::fetch",
203 url = url.as_str(),
204 "serving stale cache entry (network unavailable)"
205 );
206 }
207
208 let canonical =
209 Url::parse(&result.page.canonical_url).context("parsing canonical URL from cache row")?;
210
211 let family = cfg.tokenizer.default;
213 crate::tokenizer::ensure_loaded(family)
214 .await
215 .context("loading default tokenizer")?;
216 let original_tokens = crate::tokenizer::count(&result.page.extracted_md, family)
217 .context("counting tokens for frontmatter")?;
218
219 let metadata: crate::extractor::ExtractedMetadata = result
223 .page
224 .metadata_json
225 .as_deref()
226 .and_then(|s| serde_json::from_str(s).ok())
227 .unwrap_or_default();
228 let quality = crate::extractor::quality::score(
231 &result.page.extracted_md,
232 result.page.extracted_md.chars().count().max(1),
233 !metadata.is_empty(),
234 result.page.title.is_some(),
235 );
236
237 let (body_md, tokens, summarized) = if args.max_tokens.is_some() || summarize_opts.is_some() {
241 let registry = std::sync::Arc::new(
242 crate::summarizer::registry::build(&cfg, family)
243 .context("building summarizer registry")?,
244 );
245 let guard = std::sync::Arc::new(
249 crate::guard::Guard::from_config(&cfg.prompt_injection)
250 .context("building prompt-injection guard")?,
251 );
252 let summarizer = crate::summarizer::SummarizerService::new(
253 db.clone(),
254 registry,
255 cfg.summarization.fallback_to_extractive,
256 )
257 .with_guard(guard);
258 let defaults = crate::summarizer::DefaultsHint::from_config(&cfg.summarization);
259 maybe_summarize(
260 &summarizer,
261 &defaults,
262 family,
263 result.page.extracted_md.clone(),
264 original_tokens,
265 args.max_tokens,
266 summarize_opts,
267 )
268 .await?
269 } else {
270 (result.page.extracted_md.clone(), original_tokens, false)
271 };
272
273 let meta = PageMeta {
274 url: &url,
275 canonical_url: &canonical,
276 title: result.page.title.as_deref(),
277 fetched_at: Timestamp::now(),
278 body: &body_md,
279 tokens,
280 tokenizer_name: family.as_str(),
281 description: metadata.description.as_deref(),
282 author: metadata.author.as_deref(),
283 published: metadata.published.as_deref(),
284 modified: metadata.modified.as_deref(),
285 image: metadata.image.as_deref(),
286 og_type: metadata.og_type.as_deref(),
287 language: metadata.language.as_deref(),
288 schema_types: &metadata.schema_types,
289 extraction_quality: quality,
290 summarized,
291 headless_render: result.page.render_reason.as_deref(),
292 tables_transformed: &[],
293 images_seen: 0,
294 images_downloaded: 0,
295 images_failed: 0,
296 images_processed: vec![],
297 prompt_injection: None,
298 };
299
300 let envelope = render(&meta);
301 print!("{envelope}");
302
303 if let Some(r) = &har_recorder
304 && let Err(e) = r.flush().await
305 {
306 tracing::warn!(target: "rover::fetcher", error = ?e, "har flush failed");
307 }
308
309 Ok(())
310}
311
312async fn maybe_summarize(
317 summarizer: &crate::summarizer::SummarizerService,
318 defaults: &crate::summarizer::DefaultsHint,
319 family: crate::tokenizer::Tokenizer,
320 body: String,
321 tokens: usize,
322 max_tokens: Option<usize>,
323 summarize: Option<crate::mcp::tools::fetch::InlineSummarizeArgs>,
324) -> anyhow::Result<(String, usize, bool)> {
325 let mut body = body;
326 let mut tokens = tokens;
327 let mut summarized = false;
328
329 if let Some(inline) = summarize {
331 let opts = summarizer.resolve_defaults(
332 inline.mode.map(Into::into),
333 inline.style.map(Into::into),
334 inline.target_tokens,
335 inline.focus,
336 inline.preserve.into_iter().map(Into::into).collect(),
337 inline.backend,
338 defaults,
339 );
340 body = compact_body(summarizer, &body, &opts).await?;
341 tokens = crate::tokenizer::count(&body, family).context("counting summary tokens")?;
342 summarized = true;
343 }
344
345 if let Some(max) = max_tokens
351 && tokens > max
352 && !summarized
353 {
354 let opts = summarizer.resolve_defaults(None, None, Some(max), None, vec![], None, defaults);
355 body = compact_body(summarizer, &body, &opts).await?;
356 tokens = crate::tokenizer::count(&body, family).context("counting summary tokens")?;
357 summarized = true;
358 }
359
360 Ok((body, tokens, summarized))
361}
362
363async fn compact_body(
365 summarizer: &crate::summarizer::SummarizerService,
366 body: &str,
367 opts: &crate::summarizer::backend::CompactOpts,
368) -> anyhow::Result<String> {
369 let content_hash = format!("sha256:{}", sha256_hex(body.as_bytes()));
370 let r = summarizer
371 .compact(&content_hash, body, opts)
372 .await
373 .context("summarizing extracted markdown")?;
374 Ok(r.summary_md)
375}
376
#[cfg(test)]
mod tests {
    use super::*;
    use crate::summarizer::{DefaultsHint, SummarizerService};
    use std::sync::Arc;

    /// Configuration obtained by deserializing an empty TOML document.
    fn default_config() -> crate::config::Config {
        toml::from_str("").unwrap()
    }

    /// Builds a summarizer service on top of a fresh database file inside a
    /// temporary directory. The `TempDir` is handed back so the caller keeps
    /// the directory alive for the duration of the test.
    async fn service() -> (SummarizerService, DefaultsHint, tempfile::TempDir) {
        let scratch = tempfile::tempdir().unwrap();
        let db = Db::open(scratch.path().join("t.db")).await.unwrap();
        let config = default_config();
        let family = config.tokenizer.default;
        crate::tokenizer::ensure_loaded(family).await.unwrap();
        let registry = crate::summarizer::registry::build(&config, family).unwrap();
        let svc = SummarizerService::new(
            db,
            Arc::new(registry),
            config.summarization.fallback_to_extractive,
        );
        let defaults = DefaultsHint::from_config(&config.summarization);
        (svc, defaults, scratch)
    }

    /// Fixture: 80 numbered sentences concatenated into one long body.
    fn long_body() -> String {
        (0..80)
            .map(|i| {
                format!(
                    "Sentence number {i} states a distinct and self-contained fact about how a rover \
                     fetches and prepares web content for an agent to reason over. "
                )
            })
            .collect()
    }

    /// A body under the budget with no explicit request passes through
    /// unchanged.
    #[tokio::test]
    // The tokenizer test mutex guard is held for the whole test, including
    // across awaits (presumably to serialize tests sharing tokenizer state).
    #[allow(clippy::await_holding_lock)]
    async fn passthrough_when_under_budget_and_no_summarize() {
        let _tok = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let body = String::from("A short extracted body.");
        let tokens = crate::tokenizer::count(&body, family).unwrap();

        let outcome = maybe_summarize(
            &svc,
            &defaults,
            family,
            body.clone(),
            tokens,
            Some(10_000),
            None,
        )
        .await;
        let (out, out_tokens, summarized) = outcome.unwrap();

        assert!(!summarized, "should not summarize when under budget");
        assert_eq!(out, body);
        assert_eq!(out_tokens, tokens);
    }

    /// An explicit summarize request produces a non-empty, smaller body.
    #[tokio::test]
    // The tokenizer test mutex guard is held for the whole test, including
    // across awaits (presumably to serialize tests sharing tokenizer state).
    #[allow(clippy::await_holding_lock)]
    async fn explicit_summarize_shrinks_body() {
        let _tok = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let body = long_body();
        let tokens = crate::tokenizer::count(&body, family).unwrap();
        let request = crate::mcp::tools::fetch::InlineSummarizeArgs {
            target_tokens: Some(80),
            ..Default::default()
        };

        let outcome =
            maybe_summarize(&svc, &defaults, family, body, tokens, None, Some(request)).await;
        let (out, out_tokens, summarized) = outcome.unwrap();

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }

    /// Exceeding `max_tokens` without an explicit request triggers an
    /// automatic summary.
    #[tokio::test]
    // The tokenizer test mutex guard is held for the whole test, including
    // across awaits (presumably to serialize tests sharing tokenizer state).
    #[allow(clippy::await_holding_lock)]
    async fn max_tokens_auto_summarizes_over_budget() {
        let _tok = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let body = long_body();
        let tokens = crate::tokenizer::count(&body, family).unwrap();
        assert!(
            tokens > 400,
            "fixture should exceed the budget (got {tokens})"
        );

        let outcome =
            maybe_summarize(&svc, &defaults, family, body, tokens, Some(400), None).await;
        let (out, out_tokens, summarized) = outcome.unwrap();

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "auto-summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }
}