Skip to main content

crw_renderer/
preference.rs

1//! Per-host renderer preference learning.
2//!
//! Tracks a sliding window of LightPanda failures per normalized host and
//! promotes the host to a heavier renderer (Chrome) once the number of
//! promotion-eligible failures in the window reaches a fixed threshold. The
//! cache is bounded by entry count and entries expire on idle to keep memory
//! predictable.
7//!
8//! ## Failure semantics
9//!
10//! Only LightPanda-specific failures count toward promotion (see
11//! [`FailoverErrorKind::counts_for_promotion`]). Cloudflare challenges,
12//! network errors, and "other" failures are recorded but do not drive
13//! promotion — that's the strict-predicate guard from the plan review.
14//!
15//! ## Concurrency
16//!
17//! Each host's stats live behind a single `Mutex` to avoid TOCTOU races
18//! between `record_failure` and `should_promote`. The cache itself is
19//! `moka` async, lock-free for reads.
20
21use crw_core::types::{FailoverErrorKind, RendererKind};
22use moka::future::Cache;
23use publicsuffix::{List, Psl};
24use std::collections::VecDeque;
25use std::sync::{Arc, Mutex, OnceLock};
26use std::time::{Duration, Instant};
27
/// Upper bound on the number of failure records kept per host. When the
/// window is already full, the oldest record is dropped to make room.
const WINDOW_CAP: usize = 32;

/// Age limit for failure records (15 minutes). Records older than this are
/// pruned from the window.
const WINDOW_DURATION: Duration = Duration::from_secs(900);

/// Number of promotion-eligible failures that must be present in the
/// sliding window before a host is promoted.
const PROMOTION_THRESHOLD: usize = 3;

/// Default number of distinct hosts the preference cache tracks.
pub const DEFAULT_CAPACITY: u64 = 10_000;

/// Default idle TTL (24 hours): a host left untouched for this long is
/// evicted from the cache.
pub const DEFAULT_TTL: Duration = Duration::from_secs(86_400);
42
/// One failure observation stored in a host's sliding window.
#[derive(Debug)]
struct WindowEntry {
    /// Moment at which the failure was recorded.
    at: Instant,
    /// `true` when the failure kind is eligible to drive promotion
    /// (the strict predicate); `false` for failures that are only recorded.
    counts: bool,
}
49
50/// Per-host failure state. Single Mutex protects the entire view to avoid
51/// races between observation and decision.
52#[derive(Debug, Default)]
53pub struct RendererStats {
54    inner: Mutex<StatsInner>,
55}
56
57#[derive(Debug, Default)]
58struct StatsInner {
59    failures: VecDeque<WindowEntry>,
60    /// Whether this host has already been promoted (latched until reset).
61    promoted: bool,
62}
63
64impl RendererStats {
65    pub fn new() -> Self {
66        Self::default()
67    }
68
69    /// Record a failure observed against this host. Returns `true` if this
70    /// call caused a promotion transition (counter crossed the threshold).
71    pub fn record_failure(&self, kind: &FailoverErrorKind) -> bool {
72        let counts = kind.counts_for_promotion();
73        let now = Instant::now();
74        let mut inner = self.inner.lock().expect("RendererStats mutex poisoned");
75
76        // Drop expired entries.
77        while let Some(front) = inner.failures.front() {
78            if now.duration_since(front.at) > WINDOW_DURATION {
79                inner.failures.pop_front();
80            } else {
81                break;
82            }
83        }
84        if inner.failures.len() >= WINDOW_CAP {
85            inner.failures.pop_front();
86        }
87        inner.failures.push_back(WindowEntry { at: now, counts });
88
89        if inner.promoted {
90            return false;
91        }
92        let counting: usize = inner.failures.iter().filter(|e| e.counts).count();
93        if counting >= PROMOTION_THRESHOLD {
94            inner.promoted = true;
95            true
96        } else {
97            false
98        }
99    }
100
101    /// Record a successful render — clears the promotion latch and trims
102    /// half the window so a recovered host can return to LightPanda.
103    pub fn record_success(&self) {
104        let mut inner = self.inner.lock().expect("RendererStats mutex poisoned");
105        inner.promoted = false;
106        let drop_n = inner.failures.len() / 2;
107        for _ in 0..drop_n {
108            inner.failures.pop_front();
109        }
110    }
111
112    /// True if this host is currently promoted to a heavier renderer.
113    pub fn is_promoted(&self) -> bool {
114        self.inner
115            .lock()
116            .expect("RendererStats mutex poisoned")
117            .promoted
118    }
119}
120
121/// Per-host renderer preference cache. Cheap to clone (`Arc` inside).
122#[derive(Clone)]
123pub struct HostPreferences {
124    cache: Cache<String, Arc<RendererStats>>,
125}
126
127impl HostPreferences {
128    pub fn new(capacity: u64, ttl: Duration) -> Self {
129        let cache = Cache::builder()
130            .max_capacity(capacity)
131            .time_to_idle(ttl)
132            .build();
133        Self { cache }
134    }
135
136    pub fn with_defaults() -> Self {
137        Self::new(DEFAULT_CAPACITY, DEFAULT_TTL)
138    }
139
140    async fn stats_for(&self, host: &str) -> Arc<RendererStats> {
141        let key = host.to_string();
142        self.cache
143            .get_with(key, async { Arc::new(RendererStats::new()) })
144            .await
145    }
146
147    /// Record a failure for `host` (will be normalized). Returns the
148    /// promotion target if this call promoted the host, else `None`.
149    pub async fn record_failure(
150        &self,
151        host: &str,
152        kind: &FailoverErrorKind,
153    ) -> Option<RendererKind> {
154        let normalized = normalize_host(host);
155        let stats = self.stats_for(&normalized).await;
156        if stats.record_failure(kind) {
157            Some(RendererKind::Chrome)
158        } else {
159            None
160        }
161    }
162
163    /// Record a successful render for `host` (will be normalized).
164    pub async fn record_success(&self, host: &str) {
165        let normalized = normalize_host(host);
166        let stats = self.stats_for(&normalized).await;
167        stats.record_success();
168    }
169
170    /// Returns the preferred renderer for `host` if a promotion is in
171    /// effect, else `None` (caller falls back to default chain).
172    pub async fn preferred(&self, host: &str) -> Option<RendererKind> {
173        let normalized = normalize_host(host);
174        let stats = self.cache.get(&normalized).await?;
175        if stats.is_promoted() {
176            Some(RendererKind::Chrome)
177        } else {
178            None
179        }
180    }
181
182    /// Clear all preference state.
183    pub async fn reset_all(&self) {
184        self.cache.invalidate_all();
185        self.cache.run_pending_tasks().await;
186    }
187
188    /// Clear preference state for a specific host (will be normalized).
189    pub async fn reset_host(&self, host: &str) {
190        let normalized = normalize_host(host);
191        self.cache.invalidate(&normalized).await;
192    }
193
194    /// Current cache size (approximate).
195    pub fn size(&self) -> u64 {
196        self.cache.entry_count()
197    }
198}
199
200impl Default for HostPreferences {
201    fn default() -> Self {
202        Self::with_defaults()
203    }
204}
205
206// ── Host normalization ────────────────────────────────────────────────
207
208static PSL: OnceLock<List> = OnceLock::new();
209
210fn psl() -> &'static List {
211    PSL.get_or_init(|| {
212        // Embedded snapshot ships with publicsuffix.
213        include_str!("public_suffix_list.dat")
214            .parse()
215            .expect("embedded PSL must parse")
216    })
217}
218
219/// Normalize a host for cache keying:
220/// - lowercase, trim
221/// - strip a single leading `www.`
222/// - strip a trailing `.` (FQDN root) so `example.com.` and `example.com`
223///   share a cache entry
224/// - if the host parses as an IP literal (v4 or v6), return it raw — PSL
225///   does not understand IPs and would otherwise collapse `127.0.0.1`
226///   into `0.1`, colliding with every other `*.0.1` host
227/// - otherwise collapse to the registrable domain (eTLD+1) using the
228///   public suffix list
229///
230/// Multi-tenant hosts under a public suffix (e.g. `foo.myshopify.com`,
231/// `foo.vercel.app`) keep their tenant label because the suffix itself
232/// is `myshopify.com` / `vercel.app` — eTLD+1 ends up being the tenant.
233pub fn normalize_host(input: &str) -> String {
234    let lower = input.trim().trim_end_matches('.').to_ascii_lowercase();
235    let trimmed = lower.strip_prefix("www.").unwrap_or(&lower);
236
237    // IP literal? Bypass PSL — its eTLD+1 logic would corrupt the address.
238    if trimmed.parse::<std::net::IpAddr>().is_ok() {
239        return trimmed.to_string();
240    }
241    // Bracketed IPv6 (`[::1]`): strip brackets and parse.
242    if let Some(stripped) = trimmed.strip_prefix('[').and_then(|s| s.strip_suffix(']'))
243        && stripped.parse::<std::net::IpAddr>().is_ok()
244    {
245        return stripped.to_string();
246    }
247
248    let bytes = trimmed.as_bytes();
249    match psl().domain(bytes) {
250        Some(domain) => std::str::from_utf8(domain.as_bytes())
251            .unwrap_or(trimmed)
252            .to_string(),
253        None => trimmed.to_string(),
254    }
255}
256
#[cfg(test)]
mod tests {
    use super::*;

    // ── normalize_host ────────────────────────────────────────────────

    /// A leading `www.` is removed.
    #[test]
    fn normalizes_www_prefix() {
        let got = normalize_host("www.example.com");
        assert_eq!(got, "example.com");
    }

    /// `myshopify.com` is a public suffix, so the tenant label survives.
    #[test]
    fn keeps_shopify_tenant() {
        let got = normalize_host("foo.myshopify.com");
        assert_eq!(got, "foo.myshopify.com");
    }

    /// `vercel.app` is a public suffix, so the tenant label survives.
    #[test]
    fn keeps_vercel_tenant() {
        let got = normalize_host("myapp.vercel.app");
        assert_eq!(got, "myapp.vercel.app");
    }

    /// Deeper subdomains collapse to the registrable domain.
    #[test]
    fn collapses_subdomains_to_registrable() {
        let got = normalize_host("a.b.example.com");
        assert_eq!(got, "example.com");
    }

    /// Multi-label public suffixes are handled.
    #[test]
    fn handles_co_uk_etld() {
        let got = normalize_host("www.example.co.uk");
        assert_eq!(got, "example.co.uk");
    }

    /// Input case does not matter.
    #[test]
    fn case_insensitive() {
        let got = normalize_host("WWW.Example.COM");
        assert_eq!(got, "example.com");
    }

    /// IPv4 literals come back untouched.
    #[test]
    fn ipv4_returns_raw() {
        for addr in ["127.0.0.1", "192.168.0.1"] {
            assert_eq!(normalize_host(addr), addr);
        }
    }

    /// Different IPv4 addresses must never share a key.
    #[test]
    fn ipv4_distinct_addresses_distinct_keys() {
        // Pre-fix this collided into "0.1" via the PSL eTLD+1 logic.
        let loopback = normalize_host("127.0.0.1");
        let private = normalize_host("192.168.0.1");
        assert_ne!(loopback, private);
    }

    /// Bracketed and bare IPv6 literals map to the same bare form.
    #[test]
    fn ipv6_bracketed_returns_unbracketed() {
        for literal in ["[::1]", "::1"] {
            assert_eq!(normalize_host(literal), "::1");
        }
    }

    /// A trailing root dot does not produce a separate key.
    #[test]
    fn trailing_dot_stripped() {
        let dotted = normalize_host("example.com.");
        assert_eq!(dotted, "example.com");
        assert_eq!(dotted, normalize_host("example.com"));
    }

    // ── RendererStats ─────────────────────────────────────────────────

    /// The third eligible failure is the one that reports the promotion.
    #[test]
    fn renderer_stats_promotes_on_threshold() {
        let stats = RendererStats::new();
        let first = stats.record_failure(&FailoverErrorKind::NextJsClientError);
        assert!(!first);
        let second = stats.record_failure(&FailoverErrorKind::EmptyNextRoot);
        assert!(!second);
        let third = stats.record_failure(&FailoverErrorKind::LightpandaTimeout);
        assert!(third);
        assert!(stats.is_promoted());
    }

    /// Cloudflare challenges are recorded but never promote.
    #[test]
    fn renderer_stats_strict_predicate_excludes_cf() {
        let stats = RendererStats::new();
        (0..5).for_each(|_| {
            stats.record_failure(&FailoverErrorKind::CloudflareChallenge);
        });
        assert!(!stats.is_promoted());
    }

    /// A success releases the promotion latch.
    #[test]
    fn renderer_stats_success_clears_promotion() {
        let stats = RendererStats::new();
        (0..3).for_each(|_| {
            stats.record_failure(&FailoverErrorKind::NextJsClientError);
        });
        assert!(stats.is_promoted());

        stats.record_success();
        assert!(!stats.is_promoted());
    }

    /// The window never grows past `WINDOW_CAP`.
    #[test]
    fn renderer_stats_window_capped() {
        let stats = RendererStats::new();
        let attempts = WINDOW_CAP + 10;
        (0..attempts).for_each(|_| {
            stats.record_failure(&FailoverErrorKind::Other);
        });
        let recorded = stats.inner.lock().unwrap().failures.len();
        assert!(recorded <= WINDOW_CAP);
    }

    // ── HostPreferences ───────────────────────────────────────────────

    /// Two eligible failures report nothing; the third reports Chrome and
    /// the preference is visible afterwards.
    #[tokio::test]
    async fn host_preferences_promotes_after_threshold() {
        let prefs = HostPreferences::with_defaults();
        let warmup = [
            FailoverErrorKind::NextJsClientError,
            FailoverErrorKind::EmptyNextRoot,
        ];
        for kind in &warmup {
            let outcome = prefs.record_failure("example.com", kind).await;
            assert_eq!(outcome, None);
        }

        let outcome = prefs
            .record_failure("example.com", &FailoverErrorKind::LightpandaTimeout)
            .await;
        assert_eq!(outcome, Some(RendererKind::Chrome));

        let preferred = prefs.preferred("example.com").await;
        assert_eq!(preferred, Some(RendererKind::Chrome));
    }

    /// Failures on a subdomain promote the whole registrable domain.
    #[tokio::test]
    async fn host_preferences_normalize_collapses_subdomain() {
        let prefs = HostPreferences::with_defaults();
        let kind = FailoverErrorKind::NextJsClientError;
        for _ in 0..3 {
            prefs.record_failure("a.b.example.com", &kind).await;
        }

        let preferred = prefs.preferred("www.example.com").await;
        assert_eq!(preferred, Some(RendererKind::Chrome));
    }

    /// `reset_all` forgets an existing promotion.
    #[tokio::test]
    async fn host_preferences_reset_clears_state() {
        let prefs = HostPreferences::with_defaults();
        let kind = FailoverErrorKind::NextJsClientError;
        for _ in 0..3 {
            prefs.record_failure("example.com", &kind).await;
        }

        prefs.reset_all().await;

        let preferred = prefs.preferred("example.com").await;
        assert_eq!(preferred, None);
    }
}