1use anyhow::Context;
9use jiff::Timestamp;
10use std::path::Path;
11use url::Url;
12
13use crate::config;
14use crate::extractor::frontmatter::{PageMeta, render};
15use crate::extractor::pipeline::extract;
16use crate::fetcher::cached::{
17 CacheStatus, ExtractResult, FetchOptions, fetch_with_cache, sha256_hex,
18};
19use crate::fetcher::client::build_http_client;
20use crate::fetcher::ssrf::SsrfLevel;
21use crate::storage::Db;
22
/// Arguments for the `fetch` command, consumed by [`run`].
///
/// Every `Option` field is an override: `None` leaves the value from the
/// resolved configuration untouched.
#[derive(Debug, Clone)]
pub struct Args {
    /// URL to fetch, as supplied by the caller. `run` parses it with
    /// `Url::parse` and fails if it is not a valid URL.
    pub url: String,
    /// Forwarded unchanged as `FetchOptions::force_refresh`.
    pub force_refresh: bool,
    /// Forwarded both to the config overrides and to
    /// `FetchOptions::ignore_robots`.
    pub ignore_robots: bool,

    /// Replaces `fetch.user_agent` from the config when set.
    pub user_agent: Option<String>,

    /// Replaces `fetch.timeout_secs` from the config when set.
    /// `run` rejects `Some(0)`.
    pub timeout_secs: Option<u64>,

    /// Passed to the config's `apply_overrides` (rate limit, requests per
    /// minute judging by the name — confirm against the config module).
    pub rate_limit_rpm: Option<u32>,
    /// Passed to the config's `apply_overrides`.
    pub per_host_concurrency: Option<u32>,
    /// Passed to the config's `apply_overrides`.
    pub global_concurrency: Option<u32>,
    /// Passed to the config's `apply_overrides`.
    pub max_retries: Option<u8>,

    /// Token budget for the printed body. `run` rejects `Some(0)`. When the
    /// extracted markdown exceeds the budget and no explicit `summarize`
    /// request was given, the body is compacted with this value as the
    /// target token count.
    pub max_tokens: Option<usize>,

    /// Inline summarization request as a JSON string. `run` deserializes it
    /// into `crate::mcp::tools::fetch::InlineSummarizeArgs` and fails if the
    /// JSON does not parse.
    pub summarize: Option<String>,
}
51
52pub async fn run(args: Args, config_path: Option<&Path>) -> anyhow::Result<()> {
53 let mut cfg = config::load_resolved(config_path).context("loading config")?;
54 cfg.apply_overrides(
55 args.rate_limit_rpm,
56 args.per_host_concurrency,
57 args.global_concurrency,
58 args.max_retries,
59 args.ignore_robots,
60 );
61 if let Some(ua) = args.user_agent {
65 cfg.fetch.user_agent = ua;
66 }
67 if let Some(t) = args.timeout_secs {
68 if t == 0 {
69 anyhow::bail!("--timeout-secs must be greater than 0");
70 }
71 cfg.fetch.timeout_secs = t;
72 }
73 let url = Url::parse(&args.url).context("parsing URL argument")?;
74 let level = SsrfLevel::parse(&cfg.ssrf.level)
75 .with_context(|| format!("invalid [ssrf] level `{}` in config", cfg.ssrf.level))?;
76 let ssrf_project_root = if level == SsrfLevel::Project {
77 let raw = &cfg.ssrf.project_root;
78 let resolved = std::fs::canonicalize(raw)
79 .with_context(|| format!("canonicalizing ssrf.project_root `{}`", raw.display()))?;
80 tracing::info!(
81 target: "rover::ssrf",
82 project_root = %resolved.display(),
83 "ssrf level=project; project_root resolved",
84 );
85 Some(resolved)
86 } else {
87 None
88 };
89
90 let summarize_opts: Option<crate::mcp::tools::fetch::InlineSummarizeArgs> =
93 match args.summarize.as_deref() {
94 Some(s) => Some(serde_json::from_str(s).context("parsing --summarize JSON")?),
95 None => None,
96 };
97 if matches!(args.max_tokens, Some(0)) {
98 anyhow::bail!("--max-tokens must be greater than 0");
99 }
100
101 let data_dir = crate::paths::data_dir();
102 std::fs::create_dir_all(&data_dir).context("creating data dir")?;
103 let db = Db::open(data_dir.join("rover.db"))
104 .await
105 .context("opening cache database")?;
106
107 let client = build_http_client(&cfg.fetch.user_agent, cfg.fetch.timeout());
108 let pacer = crate::fetcher::concurrency::Pacer::new(&cfg.rate_limit);
109
110 let har_recorder: Option<std::sync::Arc<crate::fetcher::har::HarRecorder>> =
114 if !cfg.debug.har_path.is_empty() {
115 let path = std::path::PathBuf::from(&cfg.debug.har_path);
116 let r = crate::fetcher::har::HarRecorder::new(path, cfg.debug.har_body_cap)
117 .with_context(|| format!("opening har file at {}", cfg.debug.har_path))?;
118 Some(std::sync::Arc::new(r))
119 } else {
120 None
121 };
122
123 let headless_mode = if cfg.headless.auto_detect_spa {
129 crate::fetcher::HeadlessMode::Auto
130 } else {
131 crate::fetcher::HeadlessMode::Off
132 };
133 #[cfg(feature = "headless")]
134 let headless: Option<std::sync::Arc<crate::fetcher::headless::HeadlessRenderer>> =
135 if !matches!(headless_mode, crate::fetcher::HeadlessMode::Off) {
136 let r = crate::fetcher::headless::HeadlessRenderer::new(&cfg.headless)
137 .await
138 .map(std::sync::Arc::new)
139 .context("launching headless renderer")?;
140 Some(r)
141 } else {
142 None
143 };
144
145 let result = fetch_with_cache(
146 &db,
147 &client,
148 &pacer,
149 &cfg.rate_limit,
150 &cfg.robots,
151 &url,
152 &cfg.cache,
153 FetchOptions {
154 force_refresh: args.force_refresh,
155 ssrf_level: level,
156 ssrf_project_root,
157 har_recorder: har_recorder.clone(),
158 ignore_robots: args.ignore_robots,
159 user_agent: cfg.fetch.user_agent.clone(),
160 #[cfg(feature = "headless")]
161 headless: headless.clone(),
162 headless_mode,
163 synchronous_revalidation: true,
169 },
170 |body, base| {
171 let extracted =
172 extract(body, Some(base)).map_err(crate::fetcher::FetcherError::Extract)?;
173 let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
174 Ok(ExtractResult {
175 title: extracted.title,
176 body_md: extracted.body_md,
177 content_hash,
178 metadata: extracted.metadata,
179 })
180 },
181 )
182 .await
183 .context("fetching URL")?;
184
185 if matches!(result.cache_status, CacheStatus::Stale { .. }) {
186 tracing::warn!(
187 target: "rover::cli::fetch",
188 url = url.as_str(),
189 "serving stale cache entry (network unavailable)"
190 );
191 }
192
193 let canonical =
194 Url::parse(&result.page.canonical_url).context("parsing canonical URL from cache row")?;
195
196 let family = cfg.tokenizer.default;
198 crate::tokenizer::ensure_loaded(family)
199 .await
200 .context("loading default tokenizer")?;
201 let original_tokens = crate::tokenizer::count(&result.page.extracted_md, family)
202 .context("counting tokens for frontmatter")?;
203
204 let metadata: crate::extractor::ExtractedMetadata = result
208 .page
209 .metadata_json
210 .as_deref()
211 .and_then(|s| serde_json::from_str(s).ok())
212 .unwrap_or_default();
213 let quality = crate::extractor::quality::score(
216 &result.page.extracted_md,
217 result.page.extracted_md.chars().count().max(1),
218 !metadata.is_empty(),
219 result.page.title.is_some(),
220 );
221
222 let (body_md, tokens, summarized) = if args.max_tokens.is_some() || summarize_opts.is_some() {
226 let registry = std::sync::Arc::new(
227 crate::summarizer::registry::build(&cfg, family)
228 .context("building summarizer registry")?,
229 );
230 let guard = std::sync::Arc::new(
234 crate::guard::Guard::from_config(&cfg.prompt_injection)
235 .context("building prompt-injection guard")?,
236 );
237 let summarizer = crate::summarizer::SummarizerService::new(
238 db.clone(),
239 registry,
240 cfg.summarization.fallback_to_extractive,
241 )
242 .with_guard(guard);
243 let defaults = crate::summarizer::DefaultsHint::from_config(&cfg.summarization);
244 maybe_summarize(
245 &summarizer,
246 &defaults,
247 family,
248 result.page.extracted_md.clone(),
249 original_tokens,
250 args.max_tokens,
251 summarize_opts,
252 )
253 .await?
254 } else {
255 (result.page.extracted_md.clone(), original_tokens, false)
256 };
257
258 let meta = PageMeta {
259 url: &url,
260 canonical_url: &canonical,
261 title: result.page.title.as_deref(),
262 fetched_at: Timestamp::now(),
263 body: &body_md,
264 tokens,
265 tokenizer_name: family.as_str(),
266 description: metadata.description.as_deref(),
267 author: metadata.author.as_deref(),
268 published: metadata.published.as_deref(),
269 modified: metadata.modified.as_deref(),
270 image: metadata.image.as_deref(),
271 og_type: metadata.og_type.as_deref(),
272 language: metadata.language.as_deref(),
273 schema_types: &metadata.schema_types,
274 extraction_quality: quality,
275 summarized,
276 tables_transformed: &[],
277 images_seen: 0,
278 images_downloaded: 0,
279 images_failed: 0,
280 images_processed: vec![],
281 prompt_injection: None,
282 };
283
284 let envelope = render(&meta);
285 print!("{envelope}");
286
287 if let Some(r) = &har_recorder
288 && let Err(e) = r.flush().await
289 {
290 tracing::warn!(target: "rover::fetcher", error = ?e, "har flush failed");
291 }
292
293 #[cfg(feature = "headless")]
298 if let Some(renderer) = headless {
299 match std::sync::Arc::try_unwrap(renderer) {
300 Ok(r) => r.shutdown().await,
301 Err(_still_shared) => {
302 tracing::warn!(
303 target: "rover::cli::fetch",
304 "headless renderer still has outstanding Arc references at shutdown; skipping explicit shutdown",
305 );
306 }
307 }
308 }
309
310 Ok(())
311}
312
313async fn maybe_summarize(
318 summarizer: &crate::summarizer::SummarizerService,
319 defaults: &crate::summarizer::DefaultsHint,
320 family: crate::tokenizer::Tokenizer,
321 body: String,
322 tokens: usize,
323 max_tokens: Option<usize>,
324 summarize: Option<crate::mcp::tools::fetch::InlineSummarizeArgs>,
325) -> anyhow::Result<(String, usize, bool)> {
326 let mut body = body;
327 let mut tokens = tokens;
328 let mut summarized = false;
329
330 if let Some(inline) = summarize {
332 let opts = summarizer.resolve_defaults(
333 inline.mode.map(Into::into),
334 inline.style.map(Into::into),
335 inline.target_tokens,
336 inline.focus,
337 inline.preserve.into_iter().map(Into::into).collect(),
338 inline.backend,
339 defaults,
340 );
341 body = compact_body(summarizer, &body, &opts).await?;
342 tokens = crate::tokenizer::count(&body, family).context("counting summary tokens")?;
343 summarized = true;
344 }
345
346 if let Some(max) = max_tokens
352 && tokens > max
353 && !summarized
354 {
355 let opts = summarizer.resolve_defaults(None, None, Some(max), None, vec![], None, defaults);
356 body = compact_body(summarizer, &body, &opts).await?;
357 tokens = crate::tokenizer::count(&body, family).context("counting summary tokens")?;
358 summarized = true;
359 }
360
361 Ok((body, tokens, summarized))
362}
363
364async fn compact_body(
366 summarizer: &crate::summarizer::SummarizerService,
367 body: &str,
368 opts: &crate::summarizer::backend::CompactOpts,
369) -> anyhow::Result<String> {
370 let content_hash = format!("sha256:{}", sha256_hex(body.as_bytes()));
371 let r = summarizer
372 .compact(&content_hash, body, opts)
373 .await
374 .context("summarizing extracted markdown")?;
375 Ok(r.summary_md)
376}
377
#[cfg(test)]
mod tests {
    use super::*;
    use crate::summarizer::{DefaultsHint, SummarizerService};
    use std::sync::Arc;

    /// Config obtained by deserializing an empty TOML document.
    fn default_config() -> crate::config::Config {
        toml::from_str("").unwrap()
    }

    /// Builds a summarizer service backed by a throwaway database.
    ///
    /// The `TempDir` is returned so the database file outlives the test body.
    async fn service() -> (SummarizerService, DefaultsHint, tempfile::TempDir) {
        let scratch = tempfile::tempdir().unwrap();
        let store = Db::open(scratch.path().join("t.db")).await.unwrap();
        let config = default_config();
        let tokenizer = config.tokenizer.default;
        crate::tokenizer::ensure_loaded(tokenizer).await.unwrap();
        let backends = Arc::new(crate::summarizer::registry::build(&config, tokenizer).unwrap());
        let hint = DefaultsHint::from_config(&config.summarization);
        let service = SummarizerService::new(
            store,
            backends,
            config.summarization.fallback_to_extractive,
        );
        (service, hint, scratch)
    }

    /// Eighty numbered sentences: long enough to exceed the budgets used below.
    fn long_body() -> String {
        (0..80)
            .map(|i| {
                format!(
                    "Sentence number {i} states a distinct and self-contained fact about how a rover \
                     fetches and prepares web content for an agent to reason over. "
                )
            })
            .collect()
    }

    // NOTE(review): the guard from `_test_mutex()` is held across `.await`
    // points in every test below, presumably to serialize tests that share
    // tokenizer state — hence the `await_holding_lock` allowances.

    #[tokio::test]
    #[allow(clippy::await_holding_lock)]
    async fn passthrough_when_under_budget_and_no_summarize() {
        let _serial = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let input = "A short extracted body.".to_string();
        let input_tokens = crate::tokenizer::count(&input, family).unwrap();

        let outcome = maybe_summarize(
            &svc,
            &defaults,
            family,
            input.clone(),
            input_tokens,
            Some(10_000),
            None,
        )
        .await
        .unwrap();
        let (out, out_tokens, summarized) = outcome;

        assert!(!summarized, "should not summarize when under budget");
        assert_eq!(out, input);
        assert_eq!(out_tokens, input_tokens);
    }

    #[tokio::test]
    #[allow(clippy::await_holding_lock)]
    async fn explicit_summarize_shrinks_body() {
        let _serial = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let input = long_body();
        let tokens = crate::tokenizer::count(&input, family).unwrap();
        let request = crate::mcp::tools::fetch::InlineSummarizeArgs {
            target_tokens: Some(80),
            ..Default::default()
        };

        let (out, out_tokens, summarized) =
            maybe_summarize(&svc, &defaults, family, input, tokens, None, Some(request))
                .await
                .unwrap();

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }

    #[tokio::test]
    #[allow(clippy::await_holding_lock)]
    async fn max_tokens_auto_summarizes_over_budget() {
        let _serial = crate::tokenizer::_test_mutex()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        let (svc, defaults, _tmp) = service().await;
        let family = default_config().tokenizer.default;
        let input = long_body();
        let tokens = crate::tokenizer::count(&input, family).unwrap();
        assert!(
            tokens > 400,
            "fixture should exceed the budget (got {tokens})"
        );

        let (out, out_tokens, summarized) =
            maybe_summarize(&svc, &defaults, family, input, tokens, Some(400), None)
                .await
                .unwrap();

        assert!(summarized);
        assert!(!out.is_empty());
        assert!(
            out_tokens < tokens,
            "auto-summary should be smaller than the original ({out_tokens} !< {tokens})"
        );
    }
}