Skip to main content

rover/fetcher/
cached.rs

1//! Cache-aware fetch orchestrator.
2//!
3//! `fetch_with_cache` is the high-level entry point used by the CLI and the
4//! (future) MCP `fetch` tool. It wraps the raw `fetcher::fetch::fetch_url`
5//! with cache lookup, TTL-driven freshness, and write-back.
6//!
7//! Task 7 shipped the orchestrator skeleton (always a full GET on miss/stale).
8//! Task 8 added conditional GETs (`If-None-Match` / `If-Modified-Since`),
9//! 304 Not Modified handling via `pages::touch`, and real `Cache-Control` /
10//! `Expires` header propagation into the TTL decision.
11
12use jiff::Timestamp;
13use sha2::{Digest, Sha256};
14use url::Url;
15
16use super::FetcherError;
17use super::fetch::ConditionalGet;
18use super::ssrf::SsrfLevel;
19use super::ttl::{TtlDecision, compute_ttl};
20use crate::config::CacheConfig;
21use crate::extractor::metadata::ExtractedMetadata;
22use crate::storage::Db;
23use crate::storage::pages::{self, Page, url_hash};
24
/// Outcome of a cache-aware fetch.
///
/// `Stale` carries the id of the `revalidate` task that was enqueued when
/// the SWR fast-path (M6) returned the expired row. `None` means the row
/// was served stale but the task insert failed (logged; not fatal).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CacheStatus {
    /// The stored row was served as fresh: either its `expires_at` was still
    /// in the future, or the origin answered a conditional GET with 304 and
    /// the row's freshness was extended.
    Hit,
    /// An expired row was served, either via the SWR fast-path or because
    /// the network refresh failed while the row was still inside the SWR
    /// grace window.
    Stale {
        /// Id of the queued `revalidate` task, or `None` if enqueueing failed.
        revalidation_task_id: Option<String>,
    },
    /// The page was fetched and extracted during this call.
    Miss,
}
38
/// What `fetch_with_cache` returns: a Page (cache hit/miss/stale) plus the
/// cache_status that produced it. The Page mirrors the storage row so the
/// caller has both extracted_md and metadata available.
#[derive(Debug, Clone)]
pub struct CachedFetch {
    /// The page row that was served (from the cache or freshly built).
    pub page: Page,
    /// How `page` was obtained: fresh hit, stale serve, or new fetch.
    pub cache_status: CacheStatus,
}
47
/// Per-call headless mode selection.
///
/// Defined here (not behind `#[cfg(feature = "headless")]`) so every call site
/// can use `HeadlessMode::Off` without conditional compilation. The headless
/// module's own `HeadlessMode` (in `src/fetcher/headless/mod.rs`) is the same
/// shape and is interconvertible via `as_str`.
///
/// NOTE(review): no `as_str` is defined on this type in the part of the file
/// visible here — confirm the conversion helper exists before relying on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HeadlessMode {
    /// Never use the headless renderer (default).
    #[default]
    Off,
    /// Always use the headless renderer.
    On,
    /// Use the headless renderer only when SPA heuristics trigger.
    Auto,
}
64
/// Per-call options for `fetch_with_cache`.
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// When `true`, the cache lookup is skipped entirely: no cached row is
    /// served and no conditional validators are sent with the request.
    pub force_refresh: bool,
    /// SSRF policy level, forwarded to the robots fetch, the HTTP fetch and
    /// the headless renderer.
    pub ssrf_level: SsrfLevel,
    /// Required (some) when `ssrf_level == Project`. Must be pre-canonicalized.
    pub ssrf_project_root: Option<std::path::PathBuf>,
    /// Optional HAR recorder. When `Some`, every round-trip is recorded.
    pub har_recorder: Option<std::sync::Arc<crate::fetcher::har::HarRecorder>>,
    /// When `true`, skip the robots gate. Used by `--ignore-robots`.
    pub ignore_robots: bool,
    /// User-Agent used for robots.txt UA-rule evaluation. Must match
    /// `[fetch] user_agent`.
    pub user_agent: String,
    /// M9: lazily-initialized headless renderer handle (`Some` when the binary
    /// was built with `--features headless` AND the caller wired one). The
    /// browser is launched on first use inside `fetch_with_cache` — only when a
    /// render actually happens (SPA detected, bot-challenge bypass, or
    /// `On` mode), never for a plain reqwest fetch.
    #[cfg(feature = "headless")]
    pub headless: Option<crate::fetcher::headless::HeadlessHandle>,
    /// M9: per-call mode selection.
    pub headless_mode: HeadlessMode,
    /// When `true`, the caller opts out of the stale-while-revalidate
    /// fast-path: on an expired cache entry, `fetch_with_cache` performs
    /// the network refresh inline rather than serving stale and queueing
    /// a background `revalidate` task.
    ///
    /// Set this from any caller that does NOT have a running task
    /// scheduler in the same process — chiefly the one-shot CLI. The
    /// MCP server's tools leave this `false` so the agent gets a fast
    /// response and the in-process scheduler refreshes the row.
    ///
    /// Independently of this flag, the row is also re-fetched
    /// synchronously when the row expired more than
    /// `[cache] stale_while_revalidate_window` ago, so callers never
    /// receive arbitrarily old content.
    pub synchronous_revalidation: bool,
}
103
/// What `fetch_with_cache` needs from the extractor. Defined here as a tiny
/// adapter so the extractor module isn't a hard dependency of the fetcher.
#[derive(Debug, Clone)]
pub struct ExtractResult {
    /// Extracted page title, if any; stored as `Page::title`.
    pub title: Option<String>,
    /// Extracted body as Markdown; stored as `Page::extracted_md`.
    pub body_md: String,
    /// Hash of the extracted content; stored as `Page::content_hash`.
    pub content_hash: String,
    /// Structured metadata; serialized to JSON into `Page::metadata_json`.
    pub metadata: ExtractedMetadata,
}
113
114/// Cache-aware fetch entry point.
115///
116/// The extraction step is delegated to `extract_fn`: this keeps the fetcher
117/// independent of the extractor module. The CLI/MCP layer wires up
118/// `extractor::pipeline::extract`.
119#[allow(clippy::too_many_arguments)]
120pub async fn fetch_with_cache<F>(
121    db: &Db,
122    client: &reqwest::Client,
123    pacer: &crate::fetcher::concurrency::Pacer,
124    rate_cfg: &crate::config::RateLimitConfig,
125    robots_cfg: &crate::config::RobotsConfig,
126    url: &Url,
127    cache_cfg: &CacheConfig,
128    opts: FetchOptions,
129    mut extract_fn: F,
130) -> Result<CachedFetch, FetcherError>
131where
132    F: FnMut(&str, &Url) -> Result<ExtractResult, FetcherError>,
133{
134    let now = Timestamp::now().as_second();
135
136    let host = url
137        .host_str()
138        .ok_or(FetcherError::Ssrf(crate::fetcher::ssrf::SsrfError::NoHost))?;
139
140    // Robots gate (M5). Skipped when explicitly disabled or for ignore_domains.
141    let robots_skipped = !robots_cfg.respect
142        || opts.ignore_robots
143        || robots_cfg.ignore_domains.iter().any(|d| d == host);
144    let crawl_delay: Option<std::time::Duration> = if robots_skipped {
145        None
146    } else {
147        let entry = crate::fetcher::robots::ensure_entry(
148            db,
149            pacer,
150            client,
151            robots_cfg,
152            host,
153            opts.ssrf_level,
154            &opts.user_agent,
155            rate_cfg,
156        )
157        .await?;
158
159        let verdict = crate::fetcher::robots::evaluate(&entry, &opts.user_agent, url.path());
160        if matches!(verdict, crate::fetcher::robots::Verdict::Disallowed) {
161            return Err(FetcherError::RobotsDisallowed {
162                url: url.to_string(),
163                ua: opts.user_agent.clone(),
164            });
165        }
166        crate::fetcher::robots::crawl_delay(&entry, &opts.user_agent)
167    };
168
169    // Step 1: cache lookup.
170    //
171    // Three outcomes for an expired row:
172    //  1. Fresh hit (`expires_at > now`) — return immediately.
173    //  2. Expired within the SWR grace window AND caller hasn't opted out
174    //     of SWR → serve stale now, queue a `revalidate` task in the
175    //     background. The agent monitors the task id; the row gets
176    //     refreshed out-of-band.
177    //  3. Expired beyond the grace window, OR caller asked for
178    //     synchronous behaviour (e.g. CLI) → fall through to the network
179    //     refresh path, keeping the stale row threaded down for the
180    //     conditional-GET validators in Step 2.
181    //
182    // The grace window stops the SWR path from ever returning arbitrarily
183    // old content; without it, an entry that expired weeks ago would still
184    // be served stale on every fetch (because nothing was refreshing it).
185    let swr_window_secs = cache_cfg.stale_while_revalidate_window.as_secs() as i64;
186    let stale: Option<Page> = if opts.force_refresh {
187        None
188    } else {
189        match lookup_cached(db, url).await? {
190            Some(p) if p.expires_at.is_some_and(|e| e > now) => {
191                return Ok(CachedFetch {
192                    page: p,
193                    cache_status: CacheStatus::Hit,
194                });
195            }
196            Some(p) => {
197                let within_swr_window = p
198                    .expires_at
199                    .is_some_and(|e| now.saturating_sub(e) <= swr_window_secs);
200                if within_swr_window && !opts.synchronous_revalidation {
201                    // SWR fast-path: queue a revalidate task, return stale now.
202                    let task_id = insert_revalidate_task(db, url, &p).await;
203                    return Ok(CachedFetch {
204                        page: p,
205                        cache_status: CacheStatus::Stale {
206                            revalidation_task_id: task_id,
207                        },
208                    });
209                }
210                // Treat as a miss. The stale row is kept around so Step 2
211                // can build conditional validators from it.
212                Some(p)
213            }
214            None => None,
215        }
216    };
217
218    // Step 2: build conditional validators from any stale entry. `stale`
219    // is Some when we're synchronously revalidating an expired row (either
220    // because the caller opted out of SWR or the row expired beyond the
221    // grace window) — in that case we forward `If-None-Match` /
222    // `If-Modified-Since` so a 304 lets us extend the freshness on the
223    // existing row instead of re-extracting.
224    let cond = match &stale {
225        Some(p) => ConditionalGet {
226            if_none_match: p.etag.clone(),
227            if_modified_since: p.last_modified.clone(),
228        },
229        None => ConditionalGet::default(),
230    };
231
232    // Step 3: fetch (conditional if validators present).
233    //
234    // M9 mode dispatch:
235    //   - `Off` (default): today's reqwest path, unchanged.
236    //   - `Auto`: reqwest path now, optional headless re-render after extract.
237    //   - `On`: bypass reqwest entirely; render via the headless browser.
238    //
239    // The `On` branch synthesizes a `FetchedPage`-shaped value from the
240    // renderer output so step 6 (TTL) and step 7 (store) work unchanged.
241    // Track *why* the headless renderer was used, if at all, so the stored row
242    // (and the fetch frontmatter) records how this content was obtained. Set at
243    // each render site below; `None` means a plain HTTP fetch.
244    #[cfg(feature = "headless")]
245    let mut render_reason: Option<&'static str> = None;
246
247    let fetched = match opts.headless_mode {
248        HeadlessMode::Off | HeadlessMode::Auto => {
249            let retry_result = crate::fetcher::retry::with_retries(
250                db,
251                pacer,
252                client,
253                url,
254                opts.ssrf_level,
255                opts.ssrf_project_root.as_deref(),
256                opts.har_recorder.as_ref(),
257                &cond,
258                crawl_delay,
259                rate_cfg,
260            )
261            .await;
262
263            // Bot-challenge bypass (Auto mode only): a managed challenge
264            // (Vercel/Cloudflare) returns no usable content to a plain HTTP
265            // client, but the headless browser executes the JS challenge like a
266            // real browser and reaches the page. Upgrade the challenge error
267            // into a headless render here; on bypass failure, reconstruct the
268            // original challenge error so the stale/propagate logic below is
269            // unchanged.
270            #[cfg(feature = "headless")]
271            let retry_result = match retry_result {
272                Err(FetcherError::BotChallenge {
273                    url: ch_url,
274                    provider,
275                }) if opts.headless_mode == HeadlessMode::Auto && opts.headless.is_some() => {
276                    let handle = opts.headless.as_ref().expect("guarded by is_some()");
277                    auto_render_delay(handle, url, "bot_challenge_bypass").await;
278                    match handle.get().await {
279                        Ok(r) => match r
280                            .render(url, opts.ssrf_level, opts.ssrf_project_root.as_deref())
281                            .await
282                        {
283                            Ok(rendered) => {
284                                tracing::info!(target: "rover::fetcher::cached",
285                                    url = url.as_str(), provider = %provider,
286                                    "bot-protection challenge on HTTP fetch; bypassed via headless render");
287                                render_reason = Some("bot_challenge");
288                                Ok(rendered_to_fetched(rendered))
289                            }
290                            Err(render_err) => {
291                                tracing::warn!(target: "rover::fetcher::cached",
292                                    error = %render_err, url = url.as_str(), provider = %provider,
293                                    "headless bypass of bot-protection challenge failed; returning challenge error");
294                                Err(FetcherError::BotChallenge {
295                                    url: ch_url,
296                                    provider,
297                                })
298                            }
299                        },
300                        Err(launch_err) => {
301                            tracing::warn!(target: "rover::fetcher::cached",
302                                error = %launch_err, url = url.as_str(), provider = %provider,
303                                "could not launch headless renderer to bypass bot-protection challenge; returning challenge error");
304                            Err(FetcherError::BotChallenge {
305                                url: ch_url,
306                                provider,
307                            })
308                        }
309                    }
310                }
311                other => other,
312            };
313
314            match retry_result {
315                Ok(f) => f,
316                Err(e) => {
317                    // Network failure with a stale entry available. Serve the
318                    // stale row only if it's still within the SWR grace
319                    // window — beyond that we'd rather propagate the error
320                    // than misrepresent very old content as a successful
321                    // fetch. Caller can retry with `--force-refresh` or wait
322                    // for the upstream to recover.
323                    if let Some(s) = stale {
324                        let within_window = s
325                            .expires_at
326                            .is_some_and(|exp| now.saturating_sub(exp) <= swr_window_secs);
327                        if within_window {
328                            tracing::warn!(target: "rover::fetcher::cached",
329                                error = %e, url = url.as_str(), "fetch failed; serving stale within SWR window");
330                            let task_id = insert_revalidate_task(db, url, &s).await;
331                            return Ok(CachedFetch {
332                                page: s,
333                                cache_status: CacheStatus::Stale {
334                                    revalidation_task_id: task_id,
335                                },
336                            });
337                        }
338                        tracing::warn!(target: "rover::fetcher::cached",
339                            error = %e, url = url.as_str(),
340                            "fetch failed; stale entry is beyond SWR window — propagating error rather than serving very old content");
341                    }
342                    return Err(e);
343                }
344            }
345        }
346        HeadlessMode::On => {
347            #[cfg(not(feature = "headless"))]
348            {
349                return Err(FetcherError::HeadlessFeatureNotCompiled);
350            }
351            #[cfg(feature = "headless")]
352            {
353                let r = opts
354                    .headless
355                    .as_ref()
356                    .ok_or(FetcherError::HeadlessRendererUnavailable)?
357                    .get()
358                    .await?;
359                let rendered = r
360                    .render(url, opts.ssrf_level, opts.ssrf_project_root.as_deref())
361                    .await?;
362                render_reason = Some("on");
363                rendered_to_fetched(rendered)
364            }
365        }
366    };
367
368    // Step 4: 304 Not Modified — extend freshness on the stale row and serve it.
369    // With M6 SWR, conditional GETs are issued only when `force_refresh = true`
370    // *and* a stale row was somehow threaded through; the no-force_refresh path
371    // returns stale early. This block remains as a safety net.
372    //
373    // 304 is only possible via the reqwest path; the headless render
374    // synthesizes status=200 so this branch is naturally skipped for `On`.
375    if fetched.status == 304 {
376        let stale = stale.expect("304 implies a stale entry was sent");
377        let decision = compute_ttl(
378            now,
379            host,
380            fetched.cache_control.as_deref().unwrap_or(""),
381            fetched.expires.as_deref(),
382            cache_cfg,
383        );
384        let expires_at = match decision {
385            TtlDecision::Cache { expires_at } => Some(expires_at),
386            TtlDecision::DoNotCache => None,
387        };
388        pages::touch(db, &stale.url_hash, now, expires_at)
389            .await
390            .map_err(map_storage_err)?;
391        let mut page = stale;
392        page.fetched_at = now;
393        page.expires_at = expires_at;
394        return Ok(CachedFetch {
395            page,
396            cache_status: CacheStatus::Hit,
397        });
398    }
399
400    if !(200..300).contains(&fetched.status) {
401        return Err(FetcherError::Status {
402            status: fetched.status,
403            url: fetched.final_url.to_string(),
404        });
405    }
406
407    // Step 5: extract.
408    let extracted = extract_fn(&fetched.body, &fetched.final_url)?;
409
410    // M9 Auto-mode SPA heuristic: if the reqwest result looks like an
411    // unrendered SPA (`detect_spa(...).total >= 2`), re-render via headless
412    // and re-extract. When the feature is compiled out or the renderer isn't
413    // wired, we silently keep the reqwest extraction.
414    let (fetched, extracted) = if opts.headless_mode == HeadlessMode::Auto {
415        #[cfg(feature = "headless")]
416        {
417            if let Some(h) = opts.headless.as_ref() {
418                let hits =
419                    crate::fetcher::headless::detect::detect_spa(&fetched.body, &extracted.body_md);
420                if hits.total >= 2 {
421                    auto_render_delay(h, url, "spa_rerender").await;
422                    let r = h.get().await?;
423                    render_reason = Some("spa");
424                    let rendered = r
425                        .render(url, opts.ssrf_level, opts.ssrf_project_root.as_deref())
426                        .await?;
427                    let f2 = rendered_to_fetched(rendered);
428                    let e2 = extract_fn(&f2.body, &f2.final_url)?;
429                    (f2, e2)
430                } else {
431                    (fetched, extracted)
432                }
433            } else {
434                (fetched, extracted)
435            }
436        }
437        #[cfg(not(feature = "headless"))]
438        {
439            (fetched, extracted)
440        }
441    } else {
442        (fetched, extracted)
443    };
444
445    // Step 6: TTL from real Cache-Control / Expires headers.
446    let decision = compute_ttl(
447        now,
448        host,
449        fetched.cache_control.as_deref().unwrap_or(""),
450        fetched.expires.as_deref(),
451        cache_cfg,
452    );
453
454    let expires_at = match decision {
455        TtlDecision::Cache { expires_at } => Some(expires_at),
456        TtlDecision::DoNotCache => None,
457    };
458
459    let new_hash = url_hash(fetched.canonical_url.as_str());
460    let metadata_json = serde_json::to_string(&extracted.metadata).ok();
461    // Only retain the raw body when the operator opted in via `[cache]
462    // store_raw_html`. We clone the decoded UTF-8 body's bytes: the cost is
463    // proportional to the page size, but only paid on the fresh-fetch path.
464    let raw_html = if cache_cfg.store_raw_html {
465        Some(fetched.body.as_bytes().to_vec())
466    } else {
467        None
468    };
469    // Resolve the tracked headless reason into the persisted column. In a
470    // non-headless build nothing can render, so it is always `None`.
471    #[cfg(feature = "headless")]
472    let render_reason = render_reason.map(str::to_owned);
473    #[cfg(not(feature = "headless"))]
474    let render_reason: Option<String> = None;
475    let page = Page {
476        url_hash: new_hash,
477        url: url.as_str().to_owned(),
478        canonical_url: fetched.canonical_url.as_str().to_owned(),
479        title: extracted.title.clone(),
480        fetched_at: now,
481        expires_at,
482        etag: fetched.etag.clone(),
483        last_modified: fetched.last_modified.clone(),
484        content_hash: extracted.content_hash.clone(),
485        extracted_md: extracted.body_md.clone(),
486        metadata_json,
487        raw_html,
488        render_reason,
489    };
490
491    // Step 7: store (only if cacheable).
492    if expires_at.is_some() {
493        pages::upsert(db, page.clone())
494            .await
495            .map_err(map_storage_err)?;
496    }
497
498    Ok(CachedFetch {
499        page,
500        cache_status: CacheStatus::Miss,
501    })
502}
503
/// Apply the configured Auto-mode pre-render delay before escalating to the
/// headless browser. Runs *after* the render trigger has been detected (SPA
/// heuristic fired, or a bot-challenge was returned) and *before* the browser
/// is launched/driven, giving the origin a breather between the lightweight
/// HTTP fetch and the heavier browser hit. A no-op when configured to zero.
///
/// `url` and `reason` are only used for the debug log line.
#[cfg(feature = "headless")]
async fn auto_render_delay(
    handle: &crate::fetcher::headless::HeadlessHandle,
    url: &url::Url,
    reason: &'static str,
) {
    let delay = handle.launch_delay();
    if delay.is_zero() {
        return;
    }
    tracing::debug!(
        target: "rover::fetcher::cached",
        url = url.as_str(),
        delay_secs = delay.as_secs(),
        reason,
        "Auto-mode pre-render delay before headless launch",
    );
    tokio::time::sleep(delay).await;
}

/// Convert a `RenderedPage` (from the headless renderer) into a `FetchedPage`
/// so the rest of `fetch_with_cache` (TTL → store) can treat it uniformly.
///
/// Synthesizes empty cache headers, no ETag, no Last-Modified. The TTL
/// computation will therefore fall through to the default-TTL policy. The
/// canonical URL is resolved from the rendered DOM (`<link rel="canonical">`)
/// or falls back to the final URL. The status and final URL are passed
/// through from the renderer unchanged.
#[cfg(feature = "headless")]
fn rendered_to_fetched(
    rendered: crate::fetcher::headless::RenderedPage,
) -> crate::fetcher::FetchedPage {
    use crate::fetcher::FetchedPage;
    use crate::fetcher::canonical::extract_canonical_url;
    use crate::fetcher::charset::Detected;

    // Resolve the canonical URL first: it borrows `html` and `final_url`,
    // which are moved into the struct below.
    let canonical_url = extract_canonical_url(&rendered.html, &rendered.final_url, None);
    FetchedPage {
        final_url: rendered.final_url,
        canonical_url,
        status: rendered.status,
        content_type: Some("text/html; charset=utf-8".to_string()),
        body: rendered.html,
        charset: Detected::default(),
        link_header: None,
        etag: None,
        last_modified: None,
        cache_control: None,
        expires: None,
        retry_after: None,
    }
}
560
561/// Compute sha256 hex of bytes. Centralized here so callers don't have to
562/// pull in `sha2` directly.
563pub fn sha256_hex(bytes: &[u8]) -> String {
564    let mut h = Sha256::new();
565    h.update(bytes);
566    let out = h.finalize();
567    let mut s = String::with_capacity(out.len() * 2);
568    for b in out {
569        use std::fmt::Write as _;
570        write!(s, "{b:02x}").expect("write to String never fails");
571    }
572    s
573}
574
575async fn lookup_cached(db: &Db, url: &Url) -> Result<Option<Page>, FetcherError> {
576    let hash = url_hash(url.as_str());
577    if let Some(p) = pages::get_by_url_hash(db, &hash)
578        .await
579        .map_err(map_storage_err)?
580    {
581        return Ok(Some(p));
582    }
583    pages::get_by_url(db, url.as_str())
584        .await
585        .map_err(map_storage_err)
586}
587
588fn map_storage_err(e: crate::storage::StorageError) -> FetcherError {
589    tracing::error!(target: "rover::fetcher::cached", error = %e, "storage error");
590    FetcherError::Storage(e)
591}
592
593/// Enqueue a `revalidate` task for an expired cache row. Returns the task id
594/// on success. Failures are logged and swallowed: a stale-served response is
595/// still a useful answer to the agent, and the worker will re-enqueue on the
596/// next miss.
597async fn insert_revalidate_task(db: &Db, url: &Url, stale: &Page) -> Option<String> {
598    use crate::storage::tasks::{TaskInsert, TaskKind, insert};
599    let params = serde_json::to_string(&crate::tasks::types::RevalidateParams {
600        url: url.to_string(),
601        etag_at_serve: stale.etag.clone(),
602        last_modified_at_serve: stale.last_modified.clone(),
603    })
604    .ok()?;
605    let id = uuid::Uuid::now_v7().to_string();
606    match insert(
607        db,
608        TaskInsert {
609            id: id.clone(),
610            kind: TaskKind::Revalidate,
611            params_json: params,
612            owner_pid: Some(std::process::id() as i64),
613        },
614    )
615    .await
616    {
617        Ok(()) => Some(id),
618        Err(e) => {
619            tracing::warn!(
620                target: "rover::fetcher::cached",
621                error = %e,
622                url = url.as_str(),
623                "failed to enqueue revalidate task; serving stale without revalidation",
624            );
625            None
626        }
627    }
628}
629
630#[cfg(test)]
631mod tests {
632    use super::*;
633
634    #[test]
635    fn cache_status_eq() {
636        assert_ne!(
637            CacheStatus::Hit,
638            CacheStatus::Stale {
639                revalidation_task_id: None
640            }
641        );
642    }
643
644    #[test]
645    fn map_storage_err_routes_to_storage_variant() {
646        // Regression: previously collapsed every StorageError into FetcherError::Decode,
647        // producing the misleading "response decoding failed" message for DB failures.
648        let storage_err = crate::storage::StorageError::from(rusqlite::Error::QueryReturnedNoRows);
649        let mapped = map_storage_err(storage_err);
650        assert!(matches!(mapped, FetcherError::Storage(_)));
651        assert!(mapped.to_string().starts_with("storage error:"));
652    }
653
654    #[test]
655    fn sha256_hex_matches_known() {
656        assert_eq!(
657            sha256_hex(b""),
658            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
659        );
660    }
661
662    #[tokio::test]
663    async fn cache_hit_within_ttl() {
664        use crate::config::{RateLimitConfig, RobotsConfig};
665        use crate::fetcher::concurrency::Pacer;
666        use crate::storage::Db;
667        use std::time::Duration;
668        use tempfile::tempdir;
669        let tmp = tempdir().unwrap();
670        let db = Db::open(tmp.path().join("rover.db")).await.unwrap();
671        let url = Url::parse("https://example.com/").unwrap();
672        let now = Timestamp::now().as_second();
673        let page = Page {
674            url_hash: url_hash(url.as_str()),
675            url: url.to_string(),
676            canonical_url: url.to_string(),
677            title: Some("cached".into()),
678            fetched_at: now - 60,
679            expires_at: Some(now + 600),
680            etag: None,
681            last_modified: None,
682            content_hash: "x".into(),
683            extracted_md: "# cached".into(),
684            metadata_json: None,
685            raw_html: None,
686            render_reason: None,
687        };
688        pages::upsert(&db, page.clone()).await.unwrap();
689
690        let cache_cfg = CacheConfig {
691            default_ttl: Duration::from_secs(3600),
692            min_ttl: Duration::from_secs(60),
693            max_ttl: Duration::from_secs(86400),
694            stale_while_revalidate_window: Duration::from_secs(300),
695            override_no_store: false,
696            override_no_store_domains: vec![],
697            store_raw_html: false,
698        };
699        let rate_cfg = RateLimitConfig::default();
700        // avoid robots fetch in this unit test
701        let robots_cfg = RobotsConfig {
702            respect: false,
703            ..RobotsConfig::default()
704        };
705        let pacer = Pacer::new(&rate_cfg);
706        let client = super::super::client::build_http_client("test/0.1", Duration::from_secs(5));
707        let result = fetch_with_cache(
708            &db,
709            &client,
710            &pacer,
711            &rate_cfg,
712            &robots_cfg,
713            &url,
714            &cache_cfg,
715            FetchOptions {
716                force_refresh: false,
717                ssrf_level: SsrfLevel::Strict,
718                ssrf_project_root: None,
719                har_recorder: None,
720                ignore_robots: false,
721                user_agent: "test/0.1".into(),
722                #[cfg(feature = "headless")]
723                headless: None,
724                headless_mode: HeadlessMode::Off,
725                synchronous_revalidation: false,
726            },
727            |_, _| {
728                panic!("extract_fn must not be called on cache hit");
729            },
730        )
731        .await
732        .unwrap();
733        assert_eq!(result.cache_status, CacheStatus::Hit);
734        assert_eq!(result.page.title.as_deref(), Some("cached"));
735    }
736
737    // The remaining tests in this module exercise the three new corners of
738    // the SWR / synchronous-revalidation decision matrix introduced
739    // alongside the `stale_while_revalidate_window` config:
740    //
741    //   - expired within window, sync flag off  → SWR (return stale, queue task)
742    //   - expired beyond window, sync flag off  → fall through to sync fetch
743    //   - expired within window, sync flag on   → fall through to sync fetch
744    //
745    // The "cache hit" arm is covered by `cache_hit_within_ttl` above. The
746    // "force_refresh" arm is exercised by tests/fetcher_full_loop.rs and
747    // tests/fetcher_retry.rs and is unchanged by this branch.
748
/// Builds the shared fixture for the SWR / synchronous-revalidation tests.
///
/// Opens a fresh `rover.db` inside a new temporary directory and assembles
/// the configuration the tests pass to `fetch_with_cache`:
///
/// - a `CacheConfig` whose `stale_while_revalidate_window` is `swr_window`
///   (default TTL 3600 s, min TTL 0 s, max TTL 86400 s, no-store overrides
///   and raw-HTML storage disabled),
/// - a default `RateLimitConfig` and a `Pacer` built from it,
/// - a `RobotsConfig` with `respect` turned off,
/// - an HTTP client with user agent `test/0.1` and a 5 s timeout.
///
/// Returns `(db, url, cache_cfg, rate_cfg, robots_cfg, pacer, client, tmp)`.
/// `url` is only a placeholder: callers parse their own URL (each wiremock
/// server listens on a different port). The `TempDir` handle is returned so
/// the caller can keep the directory holding the database alive for the
/// duration of the test.
///
/// # Panics
///
/// Panics if the temporary directory or the database cannot be created.
#[cfg(any(test, feature = "test-loopback"))]
async fn build_swr_test_fixture(
    swr_window: std::time::Duration,
) -> (
    crate::storage::Db,
    Url,
    crate::config::CacheConfig,
    crate::config::RateLimitConfig,
    crate::config::RobotsConfig,
    crate::fetcher::concurrency::Pacer,
    reqwest::Client,
    tempfile::TempDir,
) {
    use crate::config::{RateLimitConfig, RobotsConfig};
    use crate::fetcher::concurrency::Pacer;
    use crate::storage::Db;
    use std::time::Duration;

    let tmp = tempfile::tempdir().unwrap();
    // `join` yields an owned `PathBuf`, so `tmp` is not borrowed past this
    // statement and can be moved into the returned tuple below.
    let db = Db::open(tmp.path().join("rover.db")).await.unwrap();

    let cache_cfg = CacheConfig {
        default_ttl: Duration::from_secs(3600),
        min_ttl: Duration::from_secs(0),
        max_ttl: Duration::from_secs(86400),
        stale_while_revalidate_window: swr_window,
        override_no_store: false,
        override_no_store_domains: vec![],
        store_raw_html: false,
    };
    let rate_cfg = RateLimitConfig::default();
    let robots_cfg = RobotsConfig {
        respect: false,
        ..RobotsConfig::default()
    };
    let pacer = Pacer::new(&rate_cfg);
    let client = crate::fetcher::client::build_http_client("test/0.1", Duration::from_secs(5));

    // Concrete URL is supplied by callers (each wiremock server has a
    // different port).
    let url = Url::parse("https://placeholder.invalid/").unwrap();

    // Every element is moved, so the tuple can be built directly in its
    // return order; no reordering helper is needed.
    (db, url, cache_cfg, rate_cfg, robots_cfg, pacer, client, tmp)
}
826
/// Seeds `db` with an already-expired cache row for `url`.
///
/// `now` is a Unix timestamp in seconds (callers pass
/// `Timestamp::now().as_second()`). The stored row expires at
/// `now - expired_secs_ago` and records a fetch 60 seconds before that.
/// It carries no validators (`etag` / `last_modified` are `None`), no
/// metadata, no raw HTML, and the marker values `"old"`, `"old-hash"` and
/// `"# old"` so tests can tell it apart from freshly fetched content.
///
/// # Panics
///
/// Panics if the upsert fails.
#[cfg(any(test, feature = "test-loopback"))]
async fn insert_expired_page(
    db: &crate::storage::Db,
    url: &Url,
    now: i64,
    expired_secs_ago: i64,
) {
    // Both timestamps hang off the same expiry instant.
    let expired_at = now - expired_secs_ago;

    let stale_row = Page {
        url_hash: url_hash(url.as_str()),
        url: url.to_string(),
        canonical_url: url.to_string(),
        title: Some("old".into()),
        fetched_at: expired_at - 60,
        expires_at: Some(expired_at),
        etag: None,
        last_modified: None,
        content_hash: "old-hash".into(),
        extracted_md: "# old".into(),
        metadata_json: None,
        raw_html: None,
        render_reason: None,
    };

    pages::upsert(db, stale_row).await.unwrap();
}
851
852    fn fetch_opts_with_sync(sync: bool) -> FetchOptions {
853        FetchOptions {
854            force_refresh: false,
855            ssrf_level: SsrfLevel::Loopback,
856            ssrf_project_root: None,
857            har_recorder: None,
858            ignore_robots: false,
859            user_agent: "test/0.1".into(),
860            #[cfg(feature = "headless")]
861            headless: None,
862            headless_mode: HeadlessMode::Off,
863            synchronous_revalidation: sync,
864        }
865    }
866
// Matrix corner: expired within the SWR window, sync flag off.
//
// The row expired 10 s ago and the window is 300 s, so the call must come
// back as `CacheStatus::Stale` carrying the id of a revalidate task, and
// that task must exist in storage. The extractor closure panics if it is
// ever invoked.
#[cfg(any(test, feature = "test-loopback"))]
#[tokio::test]
async fn expired_within_window_serves_stale_swr() {
    let swr_window = std::time::Duration::from_secs(300);
    let (db, _placeholder, cache_cfg, rate_cfg, robots_cfg, pacer, client, _tmp) =
        build_swr_test_fixture(swr_window).await;

    let url = Url::parse("https://example.com/within-window").unwrap();
    let now = Timestamp::now().as_second();
    // Expired 10s ago, window 300s.
    insert_expired_page(&db, &url, now, 10).await;

    let opts = fetch_opts_with_sync(false);
    let result = fetch_with_cache(
        &db,
        &client,
        &pacer,
        &rate_cfg,
        &robots_cfg,
        &url,
        &cache_cfg,
        opts,
        |_, _| panic!("extract_fn must not be called on SWR stale-serve"),
    )
    .await
    .expect("SWR path must succeed");

    let CacheStatus::Stale {
        revalidation_task_id,
    } = &result.cache_status
    else {
        panic!("expected CacheStatus::Stale, got {:?}", result.cache_status);
    };
    let task_id = revalidation_task_id
        .as_ref()
        .expect("SWR path must enqueue a revalidate task");

    // Confirm the task row actually landed in storage.
    let stored_task = crate::storage::tasks::get(&db, task_id)
        .await
        .unwrap()
        .expect("revalidate task row present after SWR fast-path");
    assert_eq!(
        stored_task.kind,
        crate::storage::tasks::TaskKind::Revalidate
    );
}
905
// Matrix corner: expired beyond the SWR window, sync flag off.
//
// The row expired an hour ago while the window is only five minutes, so
// the call must perform a synchronous fetch against the mock server,
// report `CacheStatus::Miss`, overwrite the stored row, and hit the server
// exactly once.
#[cfg(any(test, feature = "test-loopback"))]
#[tokio::test]
async fn expired_beyond_window_falls_through_to_sync_fetch() {
    use std::time::Duration;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    let server = MockServer::start().await;
    let fresh_response = ResponseTemplate::new(200)
        .set_body_string("<html><body>fresh content here</body></html>")
        .insert_header("content-type", "text/html; charset=utf-8")
        .insert_header("cache-control", "max-age=60");
    Mock::given(method("GET"))
        .respond_with(fresh_response)
        .mount(&server)
        .await;

    let (db, _placeholder, cache_cfg, rate_cfg, robots_cfg, pacer, client, _tmp) =
        build_swr_test_fixture(Duration::from_secs(300)).await;
    let target = format!("{}/x", server.uri());
    let url = Url::parse(&target).unwrap();
    let now = Timestamp::now().as_second();
    // Expired 1h ago, window 5min.
    insert_expired_page(&db, &url, now, 3600).await;

    // Caller did NOT request sync — the exhausted grace window forces it.
    let opts = fetch_opts_with_sync(false);
    let result = fetch_with_cache(
        &db,
        &client,
        &pacer,
        &rate_cfg,
        &robots_cfg,
        &url,
        &cache_cfg,
        opts,
        |_body, _base| {
            Ok(ExtractResult {
                title: Some("fresh".into()),
                body_md: "fresh".into(),
                content_hash: "fresh-hash".into(),
                metadata: crate::extractor::metadata::ExtractedMetadata::default(),
            })
        },
    )
    .await
    .expect("beyond-window expired entry must trigger a sync fetch");
    assert_eq!(result.cache_status, CacheStatus::Miss);

    // Row should now carry the new content_hash and a fresh fetched_at.
    let stored = pages::get_by_url(&db, url.as_str())
        .await
        .unwrap()
        .expect("row present");
    assert_eq!(stored.content_hash, "fresh-hash");
    assert!(stored.fetched_at >= now);

    // Wiremock saw exactly one request (no double-fetch).
    let seen = server.received_requests().await.unwrap();
    assert_eq!(seen.len(), 1);
}
961
// Matrix corner: expired within the SWR window, sync flag on.
//
// The row expired only 10 s ago (window 300 s), but the caller sets
// `synchronous_revalidation`, so the call must fetch from the mock server,
// report `CacheStatus::Miss`, refresh the stored row, and hit the server
// exactly once.
#[cfg(any(test, feature = "test-loopback"))]
#[tokio::test]
async fn synchronous_revalidation_bypasses_swr_within_window() {
    use std::time::Duration;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    let server = MockServer::start().await;
    let fresh_response = ResponseTemplate::new(200)
        .set_body_string("<html><body>fresh</body></html>")
        .insert_header("content-type", "text/html; charset=utf-8")
        .insert_header("cache-control", "max-age=60");
    Mock::given(method("GET"))
        .respond_with(fresh_response)
        .mount(&server)
        .await;

    let (db, _placeholder, cache_cfg, rate_cfg, robots_cfg, pacer, client, _tmp) =
        build_swr_test_fixture(Duration::from_secs(300)).await;
    let target = format!("{}/y", server.uri());
    let url = Url::parse(&target).unwrap();
    let now = Timestamp::now().as_second();
    // Within window…
    insert_expired_page(&db, &url, now, 10).await;

    // …but caller (e.g. CLI) opted out of SWR.
    let opts = fetch_opts_with_sync(true);
    let result = fetch_with_cache(
        &db,
        &client,
        &pacer,
        &rate_cfg,
        &robots_cfg,
        &url,
        &cache_cfg,
        opts,
        |_body, _base| {
            Ok(ExtractResult {
                title: Some("fresh".into()),
                body_md: "fresh".into(),
                content_hash: "fresh-hash".into(),
                metadata: crate::extractor::metadata::ExtractedMetadata::default(),
            })
        },
    )
    .await
    .expect("synchronous opt-out must trigger a sync fetch");
    assert_eq!(result.cache_status, CacheStatus::Miss);

    let stored = pages::get_by_url(&db, url.as_str())
        .await
        .unwrap()
        .expect("row present");
    assert!(stored.fetched_at >= now);

    let seen = server.received_requests().await.unwrap();
    assert_eq!(seen.len(), 1);
}
1014}