1use jiff::Timestamp;
13use sha2::{Digest, Sha256};
14use url::Url;
15
16use super::FetcherError;
17use super::fetch::ConditionalGet;
18use super::ssrf::SsrfLevel;
19use super::ttl::{TtlDecision, compute_ttl};
20use crate::config::CacheConfig;
21use crate::extractor::metadata::ExtractedMetadata;
22use crate::storage::Db;
23use crate::storage::pages::{self, Page, url_hash};
24
/// How a [`CachedFetch`] was satisfied by [`fetch_with_cache`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CacheStatus {
    /// The page came from the cache: either the stored entry had not yet
    /// expired, or the origin answered the conditional request with `304`
    /// and the stored entry was refreshed in place.
    Hit,
    /// An expired cache entry was served, either immediately on the
    /// stale-while-revalidate fast path or as a fallback after the network
    /// fetch failed while the entry was still inside the window.
    Stale {
        /// Id of the background revalidation task that was enqueued, or
        /// `None` when the task could not be enqueued.
        revalidation_task_id: Option<String>,
    },
    /// The page was fetched from the network and extracted during this call.
    Miss,
}
38
/// Result of [`fetch_with_cache`]: the page plus how it was obtained.
#[derive(Debug, Clone)]
pub struct CachedFetch {
    /// The cached or freshly fetched page.
    pub page: Page,
    /// Whether `page` was a cache hit, a stale serve, or a fresh fetch.
    pub cache_status: CacheStatus,
}
47
/// Selects whether [`fetch_with_cache`] uses the headless renderer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HeadlessMode {
    /// Plain HTTP fetch only (the default).
    #[default]
    Off,
    /// Always render through the headless renderer. Fails when the
    /// `headless` feature is not compiled in or no renderer handle is
    /// supplied.
    On,
    /// Plain HTTP fetch first; when a renderer handle is available, fall
    /// back to a headless render on a bot-protection challenge or when the
    /// fetched page looks like a client-side rendered app.
    Auto,
}
64
/// Per-request knobs for [`fetch_with_cache`].
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// Skip the cache lookup entirely: no fresh hit, no stale serve, and no
    /// conditional request headers.
    pub force_refresh: bool,
    /// SSRF policy level forwarded to the robots fetch, the HTTP fetch, and
    /// the headless render.
    pub ssrf_level: SsrfLevel,
    /// Optional project root forwarded with `ssrf_level` to the HTTP fetch
    /// and the headless render.
    pub ssrf_project_root: Option<std::path::PathBuf>,
    /// Optional HAR recorder forwarded to the HTTP fetch.
    pub har_recorder: Option<std::sync::Arc<crate::fetcher::har::HarRecorder>>,
    /// Skip the robots.txt check for this request.
    pub ignore_robots: bool,
    /// User agent used for robots.txt evaluation and crawl-delay lookup, and
    /// reported in `FetcherError::RobotsDisallowed`.
    pub user_agent: String,
    /// Handle to the headless renderer; only present when the `headless`
    /// feature is compiled in.
    #[cfg(feature = "headless")]
    pub headless: Option<crate::fetcher::headless::HeadlessHandle>,
    /// Whether and when to use the headless renderer.
    pub headless_mode: HeadlessMode,
    /// When `true`, an expired entry inside the stale-while-revalidate
    /// window is revalidated inline instead of being served stale right
    /// away. The stale entry is still used as a fallback if that fetch fails.
    pub synchronous_revalidation: bool,
}
103
/// Output of the extraction callback passed to [`fetch_with_cache`].
#[derive(Debug, Clone)]
pub struct ExtractResult {
    /// Extracted page title, if any.
    pub title: Option<String>,
    /// Extracted body; stored as the page's `extracted_md`.
    pub body_md: String,
    /// Hash of the extracted content; stored as the page's `content_hash`.
    pub content_hash: String,
    /// Extracted metadata; serialized to JSON for the page's `metadata_json`.
    pub metadata: ExtractedMetadata,
}
113
114#[allow(clippy::too_many_arguments)]
120pub async fn fetch_with_cache<F>(
121 db: &Db,
122 client: &reqwest::Client,
123 pacer: &crate::fetcher::concurrency::Pacer,
124 rate_cfg: &crate::config::RateLimitConfig,
125 robots_cfg: &crate::config::RobotsConfig,
126 url: &Url,
127 cache_cfg: &CacheConfig,
128 opts: FetchOptions,
129 mut extract_fn: F,
130) -> Result<CachedFetch, FetcherError>
131where
132 F: FnMut(&str, &Url) -> Result<ExtractResult, FetcherError>,
133{
134 let now = Timestamp::now().as_second();
135
136 let host = url
137 .host_str()
138 .ok_or(FetcherError::Ssrf(crate::fetcher::ssrf::SsrfError::NoHost))?;
139
140 let robots_skipped = !robots_cfg.respect
142 || opts.ignore_robots
143 || robots_cfg.ignore_domains.iter().any(|d| d == host);
144 let crawl_delay: Option<std::time::Duration> = if robots_skipped {
145 None
146 } else {
147 let entry = crate::fetcher::robots::ensure_entry(
148 db,
149 pacer,
150 client,
151 robots_cfg,
152 host,
153 opts.ssrf_level,
154 &opts.user_agent,
155 rate_cfg,
156 )
157 .await?;
158
159 let verdict = crate::fetcher::robots::evaluate(&entry, &opts.user_agent, url.path());
160 if matches!(verdict, crate::fetcher::robots::Verdict::Disallowed) {
161 return Err(FetcherError::RobotsDisallowed {
162 url: url.to_string(),
163 ua: opts.user_agent.clone(),
164 });
165 }
166 crate::fetcher::robots::crawl_delay(&entry, &opts.user_agent)
167 };
168
169 let swr_window_secs = cache_cfg.stale_while_revalidate_window.as_secs() as i64;
186 let stale: Option<Page> = if opts.force_refresh {
187 None
188 } else {
189 match lookup_cached(db, url).await? {
190 Some(p) if p.expires_at.is_some_and(|e| e > now) => {
191 return Ok(CachedFetch {
192 page: p,
193 cache_status: CacheStatus::Hit,
194 });
195 }
196 Some(p) => {
197 let within_swr_window = p
198 .expires_at
199 .is_some_and(|e| now.saturating_sub(e) <= swr_window_secs);
200 if within_swr_window && !opts.synchronous_revalidation {
201 let task_id = insert_revalidate_task(db, url, &p).await;
203 return Ok(CachedFetch {
204 page: p,
205 cache_status: CacheStatus::Stale {
206 revalidation_task_id: task_id,
207 },
208 });
209 }
210 Some(p)
213 }
214 None => None,
215 }
216 };
217
218 let cond = match &stale {
225 Some(p) => ConditionalGet {
226 if_none_match: p.etag.clone(),
227 if_modified_since: p.last_modified.clone(),
228 },
229 None => ConditionalGet::default(),
230 };
231
232 #[cfg(feature = "headless")]
245 let mut render_reason: Option<&'static str> = None;
246
247 let fetched = match opts.headless_mode {
248 HeadlessMode::Off | HeadlessMode::Auto => {
249 let retry_result = crate::fetcher::retry::with_retries(
250 db,
251 pacer,
252 client,
253 url,
254 opts.ssrf_level,
255 opts.ssrf_project_root.as_deref(),
256 opts.har_recorder.as_ref(),
257 &cond,
258 crawl_delay,
259 rate_cfg,
260 )
261 .await;
262
263 #[cfg(feature = "headless")]
271 let retry_result = match retry_result {
272 Err(FetcherError::BotChallenge {
273 url: ch_url,
274 provider,
275 }) if opts.headless_mode == HeadlessMode::Auto && opts.headless.is_some() => {
276 let handle = opts.headless.as_ref().expect("guarded by is_some()");
277 auto_render_delay(handle, url, "bot_challenge_bypass").await;
278 match handle.get().await {
279 Ok(r) => match r
280 .render(url, opts.ssrf_level, opts.ssrf_project_root.as_deref())
281 .await
282 {
283 Ok(rendered) => {
284 tracing::info!(target: "rover::fetcher::cached",
285 url = url.as_str(), provider = %provider,
286 "bot-protection challenge on HTTP fetch; bypassed via headless render");
287 render_reason = Some("bot_challenge");
288 Ok(rendered_to_fetched(rendered))
289 }
290 Err(render_err) => {
291 tracing::warn!(target: "rover::fetcher::cached",
292 error = %render_err, url = url.as_str(), provider = %provider,
293 "headless bypass of bot-protection challenge failed; returning challenge error");
294 Err(FetcherError::BotChallenge {
295 url: ch_url,
296 provider,
297 })
298 }
299 },
300 Err(launch_err) => {
301 tracing::warn!(target: "rover::fetcher::cached",
302 error = %launch_err, url = url.as_str(), provider = %provider,
303 "could not launch headless renderer to bypass bot-protection challenge; returning challenge error");
304 Err(FetcherError::BotChallenge {
305 url: ch_url,
306 provider,
307 })
308 }
309 }
310 }
311 other => other,
312 };
313
314 match retry_result {
315 Ok(f) => f,
316 Err(e) => {
317 if let Some(s) = stale {
324 let within_window = s
325 .expires_at
326 .is_some_and(|exp| now.saturating_sub(exp) <= swr_window_secs);
327 if within_window {
328 tracing::warn!(target: "rover::fetcher::cached",
329 error = %e, url = url.as_str(), "fetch failed; serving stale within SWR window");
330 let task_id = insert_revalidate_task(db, url, &s).await;
331 return Ok(CachedFetch {
332 page: s,
333 cache_status: CacheStatus::Stale {
334 revalidation_task_id: task_id,
335 },
336 });
337 }
338 tracing::warn!(target: "rover::fetcher::cached",
339 error = %e, url = url.as_str(),
340 "fetch failed; stale entry is beyond SWR window — propagating error rather than serving very old content");
341 }
342 return Err(e);
343 }
344 }
345 }
346 HeadlessMode::On => {
347 #[cfg(not(feature = "headless"))]
348 {
349 return Err(FetcherError::HeadlessFeatureNotCompiled);
350 }
351 #[cfg(feature = "headless")]
352 {
353 let r = opts
354 .headless
355 .as_ref()
356 .ok_or(FetcherError::HeadlessRendererUnavailable)?
357 .get()
358 .await?;
359 let rendered = r
360 .render(url, opts.ssrf_level, opts.ssrf_project_root.as_deref())
361 .await?;
362 render_reason = Some("on");
363 rendered_to_fetched(rendered)
364 }
365 }
366 };
367
368 if fetched.status == 304 {
376 let stale = stale.expect("304 implies a stale entry was sent");
377 let decision = compute_ttl(
378 now,
379 host,
380 fetched.cache_control.as_deref().unwrap_or(""),
381 fetched.expires.as_deref(),
382 cache_cfg,
383 );
384 let expires_at = match decision {
385 TtlDecision::Cache { expires_at } => Some(expires_at),
386 TtlDecision::DoNotCache => None,
387 };
388 pages::touch(db, &stale.url_hash, now, expires_at)
389 .await
390 .map_err(map_storage_err)?;
391 let mut page = stale;
392 page.fetched_at = now;
393 page.expires_at = expires_at;
394 return Ok(CachedFetch {
395 page,
396 cache_status: CacheStatus::Hit,
397 });
398 }
399
400 if !(200..300).contains(&fetched.status) {
401 return Err(FetcherError::Status {
402 status: fetched.status,
403 url: fetched.final_url.to_string(),
404 });
405 }
406
407 let extracted = extract_fn(&fetched.body, &fetched.final_url)?;
409
410 let (fetched, extracted) = if opts.headless_mode == HeadlessMode::Auto {
415 #[cfg(feature = "headless")]
416 {
417 if let Some(h) = opts.headless.as_ref() {
418 let hits =
419 crate::fetcher::headless::detect::detect_spa(&fetched.body, &extracted.body_md);
420 if hits.total >= 2 {
421 auto_render_delay(h, url, "spa_rerender").await;
422 let r = h.get().await?;
423 render_reason = Some("spa");
424 let rendered = r
425 .render(url, opts.ssrf_level, opts.ssrf_project_root.as_deref())
426 .await?;
427 let f2 = rendered_to_fetched(rendered);
428 let e2 = extract_fn(&f2.body, &f2.final_url)?;
429 (f2, e2)
430 } else {
431 (fetched, extracted)
432 }
433 } else {
434 (fetched, extracted)
435 }
436 }
437 #[cfg(not(feature = "headless"))]
438 {
439 (fetched, extracted)
440 }
441 } else {
442 (fetched, extracted)
443 };
444
445 let decision = compute_ttl(
447 now,
448 host,
449 fetched.cache_control.as_deref().unwrap_or(""),
450 fetched.expires.as_deref(),
451 cache_cfg,
452 );
453
454 let expires_at = match decision {
455 TtlDecision::Cache { expires_at } => Some(expires_at),
456 TtlDecision::DoNotCache => None,
457 };
458
459 let new_hash = url_hash(fetched.canonical_url.as_str());
460 let metadata_json = serde_json::to_string(&extracted.metadata).ok();
461 let raw_html = if cache_cfg.store_raw_html {
465 Some(fetched.body.as_bytes().to_vec())
466 } else {
467 None
468 };
469 #[cfg(feature = "headless")]
472 let render_reason = render_reason.map(str::to_owned);
473 #[cfg(not(feature = "headless"))]
474 let render_reason: Option<String> = None;
475 let page = Page {
476 url_hash: new_hash,
477 url: url.as_str().to_owned(),
478 canonical_url: fetched.canonical_url.as_str().to_owned(),
479 title: extracted.title.clone(),
480 fetched_at: now,
481 expires_at,
482 etag: fetched.etag.clone(),
483 last_modified: fetched.last_modified.clone(),
484 content_hash: extracted.content_hash.clone(),
485 extracted_md: extracted.body_md.clone(),
486 metadata_json,
487 raw_html,
488 render_reason,
489 };
490
491 if expires_at.is_some() {
493 pages::upsert(db, page.clone())
494 .await
495 .map_err(map_storage_err)?;
496 }
497
498 Ok(CachedFetch {
499 page,
500 cache_status: CacheStatus::Miss,
501 })
502}
503
/// Waits for the delay reported by `handle.launch_delay()` before an
/// Auto-mode headless render. A zero delay returns immediately without
/// logging.
///
/// `reason` is only used as a field on the debug log line.
#[cfg(feature = "headless")]
async fn auto_render_delay(
    handle: &crate::fetcher::headless::HeadlessHandle,
    url: &url::Url,
    reason: &'static str,
) {
    let pause = handle.launch_delay();
    if !pause.is_zero() {
        tracing::debug!(
            target: "rover::fetcher::cached",
            url = url.as_str(),
            delay_secs = pause.as_secs(),
            reason,
            "Auto-mode pre-render delay before headless launch",
        );
        tokio::time::sleep(pause).await;
    }
}
535
/// Adapts a headless render result to the `FetchedPage` shape used by the
/// rest of the pipeline.
///
/// The canonical URL is derived from the rendered HTML. All HTTP
/// header-derived fields (validators, cache headers, link header,
/// retry-after) are left empty, and the content type is fixed to UTF-8 HTML.
#[cfg(feature = "headless")]
fn rendered_to_fetched(
    rendered: crate::fetcher::headless::RenderedPage,
) -> crate::fetcher::FetchedPage {
    // Compute this first: it borrows fields that are moved out below.
    let canonical_url = crate::fetcher::canonical::extract_canonical_url(
        &rendered.html,
        &rendered.final_url,
        None,
    );

    crate::fetcher::FetchedPage {
        // Taken from the render.
        status: rendered.status,
        final_url: rendered.final_url,
        canonical_url,
        body: rendered.html,
        // Fixed values for rendered content.
        content_type: Some(String::from("text/html; charset=utf-8")),
        charset: crate::fetcher::charset::Detected::default(),
        // No HTTP response headers are carried over from a render.
        link_header: None,
        etag: None,
        last_modified: None,
        cache_control: None,
        expires: None,
        retry_after: None,
    }
}
560
561pub fn sha256_hex(bytes: &[u8]) -> String {
564 let mut h = Sha256::new();
565 h.update(bytes);
566 let out = h.finalize();
567 let mut s = String::with_capacity(out.len() * 2);
568 for b in out {
569 use std::fmt::Write as _;
570 write!(s, "{b:02x}").expect("write to String never fails");
571 }
572 s
573}
574
575async fn lookup_cached(db: &Db, url: &Url) -> Result<Option<Page>, FetcherError> {
576 let hash = url_hash(url.as_str());
577 if let Some(p) = pages::get_by_url_hash(db, &hash)
578 .await
579 .map_err(map_storage_err)?
580 {
581 return Ok(Some(p));
582 }
583 pages::get_by_url(db, url.as_str())
584 .await
585 .map_err(map_storage_err)
586}
587
/// Logs a storage error at `error` level and wraps it in
/// `FetcherError::Storage`.
fn map_storage_err(e: crate::storage::StorageError) -> FetcherError {
    // Log before wrapping: `e` is moved into the variant below.
    tracing::error!(target: "rover::fetcher::cached", error = %e, "storage error");
    FetcherError::Storage(e)
}
592
593async fn insert_revalidate_task(db: &Db, url: &Url, stale: &Page) -> Option<String> {
598 use crate::storage::tasks::{TaskInsert, TaskKind, insert};
599 let params = serde_json::to_string(&crate::tasks::types::RevalidateParams {
600 url: url.to_string(),
601 etag_at_serve: stale.etag.clone(),
602 last_modified_at_serve: stale.last_modified.clone(),
603 })
604 .ok()?;
605 let id = uuid::Uuid::now_v7().to_string();
606 match insert(
607 db,
608 TaskInsert {
609 id: id.clone(),
610 kind: TaskKind::Revalidate,
611 params_json: params,
612 owner_pid: Some(std::process::id() as i64),
613 },
614 )
615 .await
616 {
617 Ok(()) => Some(id),
618 Err(e) => {
619 tracing::warn!(
620 target: "rover::fetcher::cached",
621 error = %e,
622 url = url.as_str(),
623 "failed to enqueue revalidate task; serving stale without revalidation",
624 );
625 None
626 }
627 }
628}
629
#[cfg(test)]
mod tests {
    use super::*;

    /// `Hit` and `Stale` must compare unequal.
    #[test]
    fn cache_status_eq() {
        assert_ne!(
            CacheStatus::Hit,
            CacheStatus::Stale {
                revalidation_task_id: None
            }
        );
    }

    /// Storage errors are wrapped in `FetcherError::Storage` and keep the
    /// "storage error:" display prefix.
    #[test]
    fn map_storage_err_routes_to_storage_variant() {
        let storage_err = crate::storage::StorageError::from(rusqlite::Error::QueryReturnedNoRows);
        let mapped = map_storage_err(storage_err);
        assert!(matches!(mapped, FetcherError::Storage(_)));
        assert!(mapped.to_string().starts_with("storage error:"));
    }

    /// Known-answer check: SHA-256 of the empty input.
    #[test]
    fn sha256_hex_matches_known() {
        assert_eq!(
            sha256_hex(b""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
        );
    }

    /// A non-expired entry is returned as a `Hit` without calling the
    /// extraction callback.
    #[tokio::test]
    async fn cache_hit_within_ttl() {
        use crate::config::{RateLimitConfig, RobotsConfig};
        use crate::fetcher::concurrency::Pacer;
        use crate::storage::Db;
        use std::time::Duration;
        use tempfile::tempdir;
        let tmp = tempdir().unwrap();
        let db = Db::open(tmp.path().join("rover.db")).await.unwrap();
        let url = Url::parse("https://example.com/").unwrap();
        let now = Timestamp::now().as_second();
        let page = Page {
            url_hash: url_hash(url.as_str()),
            url: url.to_string(),
            canonical_url: url.to_string(),
            title: Some("cached".into()),
            fetched_at: now - 60,
            expires_at: Some(now + 600),
            etag: None,
            last_modified: None,
            content_hash: "x".into(),
            extracted_md: "# cached".into(),
            metadata_json: None,
            raw_html: None,
            render_reason: None,
        };
        pages::upsert(&db, page.clone()).await.unwrap();

        let cache_cfg = CacheConfig {
            default_ttl: Duration::from_secs(3600),
            min_ttl: Duration::from_secs(60),
            max_ttl: Duration::from_secs(86400),
            stale_while_revalidate_window: Duration::from_secs(300),
            override_no_store: false,
            override_no_store_domains: vec![],
            store_raw_html: false,
        };
        let rate_cfg = RateLimitConfig::default();
        // Robots handling is disabled so the test never touches the network.
        let robots_cfg = RobotsConfig {
            respect: false,
            ..RobotsConfig::default()
        };
        let pacer = Pacer::new(&rate_cfg);
        let client = super::super::client::build_http_client("test/0.1", Duration::from_secs(5));
        let result = fetch_with_cache(
            &db,
            &client,
            &pacer,
            &rate_cfg,
            &robots_cfg,
            &url,
            &cache_cfg,
            FetchOptions {
                force_refresh: false,
                ssrf_level: SsrfLevel::Strict,
                ssrf_project_root: None,
                har_recorder: None,
                ignore_robots: false,
                user_agent: "test/0.1".into(),
                #[cfg(feature = "headless")]
                headless: None,
                headless_mode: HeadlessMode::Off,
                synchronous_revalidation: false,
            },
            |_, _| {
                panic!("extract_fn must not be called on cache hit");
            },
        )
        .await
        .unwrap();
        assert_eq!(result.cache_status, CacheStatus::Hit);
        assert_eq!(result.page.title.as_deref(), Some("cached"));
    }

    /// Builds the shared fixture for the stale-while-revalidate tests: a
    /// fresh database in a temp dir, a placeholder URL, configs with the
    /// given SWR window and robots handling disabled, a pacer, and an HTTP
    /// client.
    ///
    /// The `TempDir` is returned last; callers must keep it alive for as
    /// long as they use the database.
    async fn build_swr_test_fixture(
        swr_window: std::time::Duration,
    ) -> (
        crate::storage::Db,
        Url,
        crate::config::CacheConfig,
        crate::config::RateLimitConfig,
        crate::config::RobotsConfig,
        crate::fetcher::concurrency::Pacer,
        reqwest::Client,
        tempfile::TempDir,
    ) {
        use crate::config::{RateLimitConfig, RobotsConfig};
        use crate::fetcher::concurrency::Pacer;
        use crate::storage::Db;
        use std::time::Duration;
        let tmp = tempfile::tempdir().unwrap();
        let db = Db::open(tmp.path().join("rover.db")).await.unwrap();
        let cache_cfg = CacheConfig {
            default_ttl: Duration::from_secs(3600),
            min_ttl: Duration::from_secs(0),
            max_ttl: Duration::from_secs(86400),
            stale_while_revalidate_window: swr_window,
            override_no_store: false,
            override_no_store_domains: vec![],
            store_raw_html: false,
        };
        let rate_cfg = RateLimitConfig::default();
        let robots_cfg = RobotsConfig {
            respect: false,
            ..RobotsConfig::default()
        };
        let pacer = Pacer::new(&rate_cfg);
        let client = crate::fetcher::client::build_http_client("test/0.1", Duration::from_secs(5));
        let url = Url::parse("https://placeholder.invalid/").unwrap();
        (db, url, cache_cfg, rate_cfg, robots_cfg, pacer, client, tmp)
    }

    /// Stores a page for `url` that expired `expired_secs_ago` seconds
    /// before `now`, with no validators.
    async fn insert_expired_page(
        db: &crate::storage::Db,
        url: &Url,
        now: i64,
        expired_secs_ago: i64,
    ) {
        let page = Page {
            url_hash: url_hash(url.as_str()),
            url: url.to_string(),
            canonical_url: url.to_string(),
            title: Some("old".into()),
            fetched_at: now - expired_secs_ago - 60,
            expires_at: Some(now - expired_secs_ago),
            etag: None,
            last_modified: None,
            content_hash: "old-hash".into(),
            extracted_md: "# old".into(),
            metadata_json: None,
            raw_html: None,
            render_reason: None,
        };
        pages::upsert(db, page).await.unwrap();
    }

    /// Options for the SWR tests: loopback SSRF level, no headless, and the
    /// given `synchronous_revalidation` setting.
    fn fetch_opts_with_sync(sync: bool) -> FetchOptions {
        FetchOptions {
            force_refresh: false,
            ssrf_level: SsrfLevel::Loopback,
            ssrf_project_root: None,
            har_recorder: None,
            ignore_robots: false,
            user_agent: "test/0.1".into(),
            #[cfg(feature = "headless")]
            headless: None,
            headless_mode: HeadlessMode::Off,
            synchronous_revalidation: sync,
        }
    }

    /// An entry expired 10 s ago with a 300 s window is served stale and a
    /// revalidate task row is written.
    #[tokio::test]
    async fn expired_within_window_serves_stale_swr() {
        use std::time::Duration;
        let (db, _placeholder, cache_cfg, rate_cfg, robots_cfg, pacer, client, _tmp) =
            build_swr_test_fixture(Duration::from_secs(300)).await;
        let url = Url::parse("https://example.com/within-window").unwrap();
        let now = Timestamp::now().as_second();
        insert_expired_page(&db, &url, now, 10).await;

        let result = fetch_with_cache(
            &db,
            &client,
            &pacer,
            &rate_cfg,
            &robots_cfg,
            &url,
            &cache_cfg,
            fetch_opts_with_sync(false),
            |_, _| panic!("extract_fn must not be called on SWR stale-serve"),
        )
        .await
        .expect("SWR path must succeed");
        let task_id = match &result.cache_status {
            CacheStatus::Stale {
                revalidation_task_id,
            } => revalidation_task_id
                .as_ref()
                .expect("SWR path must enqueue a revalidate task"),
            other => panic!("expected CacheStatus::Stale, got {other:?}"),
        };
        let row = crate::storage::tasks::get(&db, task_id)
            .await
            .unwrap()
            .expect("revalidate task row present after SWR fast-path");
        assert_eq!(row.kind, crate::storage::tasks::TaskKind::Revalidate);
    }

    /// An entry expired 3600 s ago with a 300 s window is not served stale:
    /// exactly one request reaches the origin and the row is replaced.
    #[tokio::test]
    async fn expired_beyond_window_falls_through_to_sync_fetch() {
        use std::time::Duration;
        use wiremock::matchers::method;
        use wiremock::{Mock, MockServer, ResponseTemplate};

        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(
                ResponseTemplate::new(200)
                    .set_body_string("<html><body>fresh content here</body></html>")
                    .insert_header("content-type", "text/html; charset=utf-8")
                    .insert_header("cache-control", "max-age=60"),
            )
            .mount(&server)
            .await;

        let (db, _placeholder, cache_cfg, rate_cfg, robots_cfg, pacer, client, _tmp) =
            build_swr_test_fixture(Duration::from_secs(300)).await;
        let url = Url::parse(&format!("{}/x", server.uri())).unwrap();
        let now = Timestamp::now().as_second();
        insert_expired_page(&db, &url, now, 3600).await;

        let result = fetch_with_cache(
            &db,
            &client,
            &pacer,
            &rate_cfg,
            &robots_cfg,
            &url,
            &cache_cfg,
            fetch_opts_with_sync(false),
            |_body, _base| {
                Ok(ExtractResult {
                    title: Some("fresh".into()),
                    body_md: "fresh".into(),
                    content_hash: "fresh-hash".into(),
                    metadata: crate::extractor::metadata::ExtractedMetadata::default(),
                })
            },
        )
        .await
        .expect("beyond-window expired entry must trigger a sync fetch");
        assert_eq!(result.cache_status, CacheStatus::Miss);
        let row = pages::get_by_url(&db, url.as_str())
            .await
            .unwrap()
            .expect("row present");
        assert_eq!(row.content_hash, "fresh-hash");
        assert!(row.fetched_at >= now);
        assert_eq!(server.received_requests().await.unwrap().len(), 1);
    }

    /// With `synchronous_revalidation` set, an entry inside the SWR window
    /// is still refetched inline rather than served stale.
    #[tokio::test]
    async fn synchronous_revalidation_bypasses_swr_within_window() {
        use std::time::Duration;
        use wiremock::matchers::method;
        use wiremock::{Mock, MockServer, ResponseTemplate};

        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(
                ResponseTemplate::new(200)
                    .set_body_string("<html><body>fresh</body></html>")
                    .insert_header("content-type", "text/html; charset=utf-8")
                    .insert_header("cache-control", "max-age=60"),
            )
            .mount(&server)
            .await;

        let (db, _placeholder, cache_cfg, rate_cfg, robots_cfg, pacer, client, _tmp) =
            build_swr_test_fixture(Duration::from_secs(300)).await;
        let url = Url::parse(&format!("{}/y", server.uri())).unwrap();
        let now = Timestamp::now().as_second();
        insert_expired_page(&db, &url, now, 10).await;

        let result = fetch_with_cache(
            &db,
            &client,
            &pacer,
            &rate_cfg,
            &robots_cfg,
            &url,
            &cache_cfg,
            fetch_opts_with_sync(true),
            |_body, _base| {
                Ok(ExtractResult {
                    title: Some("fresh".into()),
                    body_md: "fresh".into(),
                    content_hash: "fresh-hash".into(),
                    metadata: crate::extractor::metadata::ExtractedMetadata::default(),
                })
            },
        )
        .await
        .expect("synchronous opt-out must trigger a sync fetch");
        assert_eq!(result.cache_status, CacheStatus::Miss);
        let row = pages::get_by_url(&db, url.as_str())
            .await
            .unwrap()
            .expect("row present");
        assert!(row.fetched_at >= now);
        assert_eq!(server.received_requests().await.unwrap().len(), 1);
    }
}