nab/detect/labyrinth.rs

//! Cloudflare AI Labyrinth and similar bot-trap detection.
//!
//! [Cloudflare AI Labyrinth](https://blog.cloudflare.com/ai-labyrinth/)
//! is a defence-in-depth system that serves *invisible*, hidden-link,
//! AI-generated content to suspected scrapers. The pages contain real
//! scientific facts, but have nothing to do with the host site, are
//! marked `noindex`, and are riddled with `display:none` / `aria-hidden`
//! anchors. Any client that follows two such links is added to
//! Cloudflare's bot fingerprint database.
//!
//! `nab` is *not* a bot — but its fetcher behaves enough like one to
//! get caught. This module detects the trap from the HTML body alone
//! using five cheap signals:
//!
//! 1. **Hidden-link density** — ratio of `<a>` elements that are
//!    invisible (CSS `display:none`, `visibility:hidden`, `opacity:0`,
//!    or `aria-hidden="true"`) versus total links.
//! 2. **Topic drift** — keyword overlap between `<title>` and the
//!    first 500 chars of `<body>` text. AI Labyrinth pages tend to
//!    drift wildly because the body is generated while the title is
//!    the host site's chrome.
//! 3. **`noindex` + heavy body** — a `<meta name="robots"
//!    content="noindex">` directive combined with >2000 characters of
//!    rendered text. Real noindex pages (login walls, search results)
//!    are usually thin.
//! 4. **Text-structure fingerprint** — sentence-length variance.
//!    Human writing is bursty (short sentences interleaved with long
//!    ones); generated text is much more uniform.
//! 5. **Link-graph fanout** — number of distinct `<a>` targets that
//!    look auto-generated (random slugs, repeated path prefixes).
//!
//! Each signal contributes a weighted score. The total maps to a
//! [`Verdict`]: `Clean` (<30), `Suspicious` (30..60), or `Trap` (>=60).
//!
//! The scorer is deliberately conservative on the high end so that
//! tripping `Trap` requires multiple signals to agree. This module
//! never makes a network call.
//!
//! # Example
//!
//! ```no_run
//! use nab::detect::{detect_labyrinth, Verdict};
//! use url::Url;
//!
//! let html = "<html><body><h1>Hi</h1></body></html>";
//! let url = Url::parse("https://example.com/").unwrap();
//! let score = detect_labyrinth(html, &url);
//! assert!(matches!(score.verdict, Verdict::Clean));
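//!
//! // The per-signal breakdown is available for logging; a sketch of what a
//! // `--verbose` caller might print for each contribution:
//! for signal in &score.signals {
//!     eprintln!(
//!         "{}: value={:.2}, score={:.1} ({})",
//!         signal.name, signal.value, signal.score, signal.detail
//!     );
//! }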
//! ```

use std::collections::HashSet;
use std::sync::LazyLock;

use scraper::{Html, Selector};
use url::Url;

// ── Thresholds ───────────────────────────────────────────────────────
//
// Verdict thresholds. Tuned so that legitimate pages with the
// occasional `display:none` cookie banner stay below 30, and a fully
// constructed labyrinth page comfortably exceeds 60. See the unit
// tests in this file for the calibration cases.
const SUSPICIOUS_THRESHOLD: f32 = 30.0;
const TRAP_THRESHOLD: f32 = 60.0;
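
// No single signal can reach TRAP_THRESHOLD on its own: the per-signal
// maxima below are 40 (hidden-link density), 20 (topic drift), 20
// (noindex + rich body), 15 (text structure), and 15 (link fanout), so a
// Trap verdict needs at least two strong signals to agree (e.g. 40 + 20).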

// ── Selectors (compiled once) ────────────────────────────────────────

static A_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("a").expect("static <a> selector"));
static TITLE_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("title").expect("static <title> selector"));
static BODY_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("body").expect("static <body> selector"));
static META_ROBOTS_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
    Selector::parse(r#"meta[name="robots"], meta[name="ROBOTS"]"#)
        .expect("static meta robots selector")
});

// ── Public types ─────────────────────────────────────────────────────

/// Final classification of a fetched page.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Verdict {
    /// Page looks like normal human content. Score < 30.
    Clean,
    /// Page exhibits one or two trap-like properties. Score 30..60.
    /// Worth logging, not worth aborting.
    Suspicious,
    /// Page is almost certainly a bot trap. Score >= 60.
    /// Caller should refuse to return content.
    Trap,
}

/// One contribution to the total score.
#[derive(Debug, Clone, PartialEq)]
pub struct Signal {
    /// Short stable identifier for the signal (e.g. `"hidden_link_density"`).
    pub name: &'static str,
    /// Raw measurement (ratio, character count, variance, etc.).
    pub value: f32,
    /// Weighted score added to the total. Range 0..=40.
    pub score: f32,
    /// Human-readable explanation for logs / `--verbose`.
    pub detail: String,
}

/// Result of running [`detect_labyrinth`] on a page.
#[derive(Debug, Clone)]
pub struct LabyrinthScore {
    /// Sum of all signal scores.
    pub total: f32,
    /// Per-signal breakdown for debugging.
    pub signals: Vec<Signal>,
    /// Final classification.
    pub verdict: Verdict,
}

impl LabyrinthScore {
    /// `true` if the verdict is `Trap`.
    #[must_use]
    pub fn is_trap(&self) -> bool {
        matches!(self.verdict, Verdict::Trap)
    }

    /// `true` if the verdict is `Suspicious` or worse.
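    ///
    /// A minimal caller-side sketch (log-and-continue on anything suspicious):
    ///
    /// ```no_run
    /// use nab::detect::detect_labyrinth;
    /// use url::Url;
    ///
    /// let url = Url::parse("https://example.com/").unwrap();
    /// let score = detect_labyrinth("<html><body></body></html>", &url);
    /// if score.is_suspicious() {
    ///     eprintln!("suspicious page, total={:.0}", score.total);
    /// }
    /// ```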
    #[must_use]
    pub fn is_suspicious(&self) -> bool {
        matches!(self.verdict, Verdict::Suspicious | Verdict::Trap)
    }
}

/// Inspect `html` and assign a labyrinth score.
///
/// `url` is used for link-graph fanout (resolving relative anchors and
/// classifying internal vs external links). The function never
/// performs network IO.
///
/// # Performance
///
/// One full HTML parse plus a handful of CSS selector walks. On a
/// 200 KB document this completes in single-digit milliseconds on a
/// modern laptop.
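///
/// # Example
///
/// A caller-side sketch; the `fetch` helper here is hypothetical and simply
/// stands in for whatever produced the HTML body:
///
/// ```no_run
/// use nab::detect::{detect_labyrinth, Verdict};
/// use url::Url;
///
/// # fn fetch(_url: &Url) -> String { String::new() }
/// let url = Url::parse("https://example.com/article").unwrap();
/// let html = fetch(&url);
/// match detect_labyrinth(&html, &url).verdict {
///     Verdict::Trap => eprintln!("refusing to return labyrinth content"),
///     Verdict::Suspicious => eprintln!("suspicious page, logging and continuing"),
///     Verdict::Clean => println!("{html}"),
/// }
/// ```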
#[must_use]
pub fn detect_labyrinth(html: &str, url: &Url) -> LabyrinthScore {
    let doc = Html::parse_document(html);

    let signals = vec![
        score_hidden_link_density(&doc),
        score_topic_drift(&doc),
        score_meta_directive(&doc),
        score_text_structure(&doc),
        score_link_graph_fanout(&doc, url),
    ];

    let total: f32 = signals.iter().map(|s| s.score).sum();
    let verdict = classify(total);

    LabyrinthScore {
        total,
        signals,
        verdict,
    }
}

fn classify(total: f32) -> Verdict {
    if total >= TRAP_THRESHOLD {
        Verdict::Trap
    } else if total >= SUSPICIOUS_THRESHOLD {
        Verdict::Suspicious
    } else {
        Verdict::Clean
    }
}

// ── Signal 1: hidden-link density ────────────────────────────────────

/// Returns `true` if the inline `style` attribute or `aria-hidden`
/// attribute marks the element as visually hidden. We deliberately do
/// not parse stylesheets — labyrinth pages embed the hiding inline so
/// the trap renders identically with or without external CSS.
fn looks_hidden(style: Option<&str>, aria_hidden: Option<&str>, hidden_attr: bool) -> bool {
    if hidden_attr {
        return true;
    }
    if matches!(aria_hidden, Some(v) if v.eq_ignore_ascii_case("true")) {
        return true;
    }
    let Some(style) = style else { return false };
    let s: String = style.chars().filter(|c| !c.is_whitespace()).collect();
    let s = s.to_ascii_lowercase();
    s.contains("display:none")
        || s.contains("visibility:hidden")
        // A bare "opacity:0" substring would also match e.g. "opacity:0.5",
        // so parse the declared value and treat only an explicit zero
        // ("opacity:0", "opacity:0;", "opacity:0.0", "opacity:.0") as hidden.
        || s.split(';').any(|decl| {
            decl.strip_prefix("opacity:")
                .is_some_and(|v| matches!(v.parse::<f32>(), Ok(n) if n == 0.0))
        })
}

fn score_hidden_link_density(doc: &Html) -> Signal {
    let mut total = 0u32;
    let mut hidden = 0u32;

    for a in doc.select(&A_SELECTOR) {
        total += 1;
        let el = a.value();
        if looks_hidden(
            el.attr("style"),
            el.attr("aria-hidden"),
            el.attr("hidden").is_some(),
        ) {
            hidden += 1;
        }
    }

    let ratio = if total == 0 {
        0.0
    } else {
        f32::from(u16::try_from(hidden).unwrap_or(u16::MAX))
            / f32::from(u16::try_from(total).unwrap_or(u16::MAX))
    };

    // Cliff-edge weighting: <2% normal, 2-10% odd, 10-30% suspicious,
    // >30% almost certainly a trap. The Cloudflare reference pages run
    // ~80-100% hidden, so we cap the score at 40 once we cross 30%.
    //
    // We *also* require a minimum number of hidden links to avoid
    // false positives on legitimate pages with one or two hidden cookie
    // banner / privacy / accessibility anchors. Real labyrinth pages
    // have dozens of hidden anchors.
    let score = if total < 10 || hidden < 5 {
        // Too few links — or too few hidden — to be statistically meaningful.
        0.0
    } else if ratio >= 0.30 {
        40.0
    } else if ratio >= 0.10 {
        25.0
    } else if ratio >= 0.02 {
        8.0
    } else {
        0.0
    };

    Signal {
        name: "hidden_link_density",
        value: ratio,
        score,
        detail: format!("{hidden}/{total} links hidden ({:.1}%)", ratio * 100.0),
    }
}

// ── Signal 2: topic drift ────────────────────────────────────────────

/// Lowercase, strip punctuation, split into >=4 character tokens.
/// Stop-words and short tokens contribute nothing to topical
/// similarity, so we drop them aggressively to keep the heuristic
/// sharp on small samples.
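///
/// For example, `"The Kepler-9b Survey"` tokenizes to `{"kepler", "survey"}`:
/// `"The"` and `"9b"` are too short, and the hyphen splits the compound term.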
fn tokenize(s: &str) -> HashSet<String> {
    s.split(|c: char| !c.is_alphanumeric())
        .filter(|w| w.len() >= 4)
        .map(str::to_ascii_lowercase)
        .filter(|w| !STOPWORDS.contains(&w.as_str()))
        .collect()
}

const STOPWORDS: &[&str] = &[
    "this", "that", "with", "from", "have", "been", "were", "they", "their", "there", "which",
    "about", "would", "could", "should", "into", "more", "than", "then", "what", "when", "your",
    "will", "also", "such", "some", "other", "these", "those", "page", "site",
];

fn score_topic_drift(doc: &Html) -> Signal {
    let title_text: String = doc
        .select(&TITLE_SELECTOR)
        .next()
        .map(|t| t.text().collect::<String>())
        .unwrap_or_default();

    let body_text: String = doc
        .select(&BODY_SELECTOR)
        .next()
        .map(|b| b.text().collect::<Vec<_>>().join(" "))
        .unwrap_or_default();

    let body_sample: String = body_text.chars().take(500).collect();

    let title_tokens = tokenize(&title_text);
    let body_tokens = tokenize(&body_sample);

    if title_tokens.is_empty() || body_tokens.is_empty() {
        // Cannot judge — abstain.
        return Signal {
            name: "topic_drift",
            value: 0.0,
            score: 0.0,
            detail: "title or body too short to compare".to_string(),
        };
    }

    let intersection = title_tokens.intersection(&body_tokens).count();
    let union = title_tokens.union(&body_tokens).count();
    // Jaccard similarity. Range 0..=1.
    let similarity = if union == 0 {
        0.0
    } else {
        intersection as f32 / union as f32
    };

    // Drift is the inverse of similarity. We only penalise it when *combined*
    // with rich body text — short pages legitimately have unrelated
    // chrome titles ("Login | Acme") and short forms.
    let score = if body_text.len() < 800 {
        0.0
    } else if similarity < 0.05 {
        20.0
    } else if similarity < 0.15 {
        8.0
    } else {
        0.0
    };

    Signal {
        name: "topic_drift",
        value: similarity,
        score,
        detail: format!(
            "title∩body Jaccard={:.2} (title={}t, body={}t)",
            similarity,
            title_tokens.len(),
            body_tokens.len()
        ),
    }
}

// ── Signal 3: noindex + rich body ────────────────────────────────────

fn score_meta_directive(doc: &Html) -> Signal {
    let mut noindex = false;
    for meta in doc.select(&META_ROBOTS_SELECTOR) {
        if let Some(content) = meta.value().attr("content")
            && content.to_ascii_lowercase().contains("noindex")
        {
            noindex = true;
            break;
        }
    }

    let body_chars = doc
        .select(&BODY_SELECTOR)
        .next()
        // Count characters rather than bytes so the ">2000 characters" rule
        // also holds for non-ASCII pages.
        .map_or(0, |b| b.text().map(|t| t.chars().count()).sum::<usize>());

    // Real noindex pages (search results, login pages) are usually
    // thin. A rich, articulate noindex page is the labyrinth's
    // signature.
    let score = if noindex && body_chars > 2000 {
        20.0
    } else if noindex && body_chars > 800 {
        6.0
    } else {
        0.0
    };

    Signal {
        name: "meta_directive",
        value: if noindex { 1.0 } else { 0.0 },
        score,
        detail: format!("noindex={noindex}, body_chars={body_chars}"),
    }
}

// ── Signal 4: text structure fingerprint ─────────────────────────────

fn sentence_lengths(text: &str) -> Vec<usize> {
    let mut out = Vec::new();
    let mut current: usize = 0;
    for ch in text.chars() {
        if ch == '.' || ch == '!' || ch == '?' {
            if current >= 3 {
                out.push(current);
            }
            current = 0;
        } else if !ch.is_whitespace() {
            current += 1;
        } else if current > 0 {
            // count whitespace as part of the in-progress sentence
            current += 1;
        }
    }
    if current >= 3 {
        out.push(current);
    }
    out
}

fn variance(samples: &[usize]) -> f32 {
    if samples.len() < 2 {
        return 0.0;
    }
    let mean: f32 = samples.iter().map(|&n| n as f32).sum::<f32>() / samples.len() as f32;
    let var: f32 = samples
        .iter()
        .map(|&n| {
            let d = n as f32 - mean;
            d * d
        })
        .sum::<f32>()
        / samples.len() as f32;
    var
}

fn score_text_structure(doc: &Html) -> Signal {
    let body_text: String = doc
        .select(&BODY_SELECTOR)
        .next()
        .map(|b| b.text().collect::<Vec<_>>().join(" "))
        .unwrap_or_default();

    let lengths = sentence_lengths(&body_text);
    if lengths.len() < 8 {
        return Signal {
            name: "text_structure",
            value: 0.0,
            score: 0.0,
            detail: format!("only {} sentences, abstaining", lengths.len()),
        };
    }
    let var = variance(&lengths);
    let mean: f32 = lengths.iter().map(|&n| n as f32).sum::<f32>() / lengths.len() as f32;
    // Coefficient of variation: dimensionless, comparable across pages.
    let cv = if mean > 0.0 { var.sqrt() / mean } else { 0.0 };

    // Human prose typically has CV >= 0.5. AI-generated trap text in
    // the wild measures around 0.15-0.30. Below 0.20 is a strong tell.
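    //
    // Worked example: sentence lengths [40, 42, 38, 41] give mean 40.25 and
    // cv ≈ 0.04 (very uniform), while [12, 80, 25, 140] give mean 64.25 and
    // cv ≈ 0.79 (bursty).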
    let score = if cv < 0.20 {
        15.0
    } else if cv < 0.30 {
        6.0
    } else {
        0.0
    };

    Signal {
        name: "text_structure",
        value: cv,
        score,
        detail: format!("{} sentences, mean={mean:.1}, cv={cv:.2}", lengths.len()),
    }
}

// ── Signal 5: link-graph fanout ──────────────────────────────────────

fn score_link_graph_fanout(doc: &Html, base: &Url) -> Signal {
    let base_host = base.host_str().unwrap_or("");
    let mut internal_targets: HashSet<String> = HashSet::new();
    let mut slug_like: u32 = 0;

    for a in doc.select(&A_SELECTOR) {
        let Some(href) = a.value().attr("href") else {
            continue;
        };
        let Ok(resolved) = base.join(href) else {
            continue;
        };
        if resolved.host_str() != Some(base_host) || base_host.is_empty() {
            continue;
        }
        let path = resolved.path().to_string();
        // "Slug-like" = last path segment is long (>=10 chars) and
        // contains both letters and digits, the typical look of an
        // auto-generated trap URL like /article/abc123def-kepler.
        if let Some(last) = path.rsplit('/').find(|s| !s.is_empty())
            && last.len() >= 10
            && last.chars().any(|c| c.is_ascii_digit())
            && last.chars().any(|c| c.is_ascii_alphabetic())
        {
            slug_like += 1;
        }
        internal_targets.insert(path);
    }

    let distinct = internal_targets.len();
    // Pages that fan out to dozens of slug-like internal targets are
    // suspicious; legitimate articles rarely cross 10.
    let score = if distinct >= 40 && slug_like >= 20 {
        15.0
    } else if distinct >= 20 && slug_like >= 10 {
        8.0
    } else {
        0.0
    };

    Signal {
        name: "link_graph_fanout",
        value: distinct as f32,
        score,
        detail: format!("{distinct} distinct internal targets, {slug_like} slug-like"),
    }
}

// ─────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
#[allow(clippy::format_push_string)] // tests build small fixtures, readability > micro-perf
mod tests {
    use super::*;

    fn url() -> Url {
        Url::parse("https://example.com/article").unwrap()
    }

    /// A small, real-world looking page rendered from a README-style
    /// markdown source. Should score Clean.
    fn clean_readme_page() -> String {
        let mut html = String::from(
            r#"<!doctype html>
<html lang="en">
<head>
  <title>nab — README</title>
  <meta charset="utf-8">
</head>
<body>
  <header><a href="/">Home</a> <a href="/docs">Docs</a> <a href="/changelog">Changelog</a></header>
  <article>
    <h1>nab — README</h1>
    <p>nab is a small command-line HTTP client that turns any URL into clean
    markdown for LLM context windows. It supports cookies, 1Password, HTTP/3,
    and a labyrinth detector that protects scrapers from Cloudflare's bot trap.</p>
    <p>Install with cargo install nab. The README explains every flag in detail.
    Most users only need <code>nab fetch URL</code>; everything else is opt-in.</p>
    <p>nab speaks markdown so an LLM can read its output without wading through
    HTML chrome. The readability extractor keeps the article body and discards
    navigation, ads, and footers.</p>
    <p>Building from source requires Rust 1.93. Run cargo test to execute the
    suite, which is fast: most tests finish in milliseconds.</p>
    <p>Bug reports are welcome on GitHub. Please attach a minimal reproduction.</p>
  </article>
  <footer><a href="/about">About</a></footer>
</body>
</html>"#,
        );
        // pad with realistic prose so length checks fire correctly
        for _ in 0..3 {
            html.push_str(
                r"<p>nab is honest about its limitations. It will not log in for you,
will not solve CAPTCHAs, and will not pretend to be a browser it isn't.
The goal is reliable text, not stealth. Yes, that means some sites refuse
to serve us. We accept that trade-off.</p>",
            );
        }
        html
    }

    /// Synthetic Cloudflare-style labyrinth: many hidden links, noindex,
    /// uniform AI-flavored sentences, slug-like fanout.
    fn synthetic_trap_page() -> String {
        let mut html = String::from(
            r#"<!doctype html>
<html><head>
<title>Curated Hub</title>
<meta name="robots" content="noindex, nofollow">
</head><body>
<h1>Knowledge Index</h1>
"#,
        );
        // 50 hidden anchors with slug-like targets.
        for i in 0..50 {
            html.push_str(&format!(
                r#"<a href="/labyrinth/article-{i}-kepler-9b" style="display:none">Article {i} on Kepler 9b</a>
"#,
            ));
        }
        // 6 visible (still slug-like) anchors, so the hidden-link ratio is
        // ~89% rather than 100%.
        for i in 0..6 {
            html.push_str(&format!(
                r#"<a href="/labyrinth/topic-{i}-quantum-3a">Topic {i}</a>
"#,
            ));
        }
        // Uniform AI-flavored prose, low burstiness, > 2000 chars,
        // and ZERO overlap with the title "Curated Hub".
        for _ in 0..30 {
            html.push_str(
                "The orbit measured exactly forty seven minutes around the dwarf companion. \
                 Astronomers logged precisely twelve transits during the seasonal window. \
                 Spectral lines indicated roughly six different volatile compounds present. \
                 Photometric variation peaked around eighteen percent across the cycle. \
                 Researchers documented carefully nine candidate planetary signatures here. ",
            );
        }
        html.push_str("</body></html>");
        html
    }

    /// A partial match: some signals fire, but not enough to cross the Trap
    /// threshold. Should land in Suspicious.
    fn partial_match_page() -> String {
        let mut html = String::from(
            r#"<!doctype html>
<html><head><title>Acme Login</title>
<meta name="robots" content="noindex">
</head><body>"#,
        );
        // Trigger meta_directive (noindex + rich body) and topic_drift
        // (title says "Acme Login", body talks about astronomy).
        for _ in 0..40 {
            html.push_str(
                "<p>The orbit measured forty seven minutes around the dwarf companion star. \
                 Spectral lines indicated multiple different volatile compounds present today. \
                 Photometric variation peaked at eighteen percent across the observation cycle.</p>",
            );
        }
        // A normal, mostly visible link set.
        for i in 0..5 {
            html.push_str(&format!(r#"<a href="/page{i}">Page {i}</a>"#));
        }
        html.push_str("</body></html>");
        html
    }

    /// Legitimate page with a couple of `display:none` cookie banner
    /// anchors. Must NOT trip the detector.
    fn legitimate_cookie_banner_page() -> String {
        let mut html = String::from(
            r#"<!doctype html>
<html><head><title>Acme Blog — How we built nab</title></head>
<body>
<nav>
  <a href="/">Home</a>
  <a href="/blog">Blog</a>
  <a href="/about">About</a>
</nav>
<div class="cookie-banner">
  <a href="/cookies" style="display:none">Cookie policy</a>
  <a href="/privacy" style="display:none">Privacy policy</a>
</div>
<article>
<h1>How we built nab</h1>
"#,
        );
        for _ in 0..5 {
            html.push_str(
                "<p>We built nab because the existing tools were either too heavy or too \
                 fragile. We wanted a single binary that could fetch any URL, follow \
                 redirects sanely, and give us back clean markdown. The first prototype \
                 took a weekend. Most of the work after that was removing things.</p>
                 <p>Honesty is the design principle. nab does not pretend to be a browser \
                 it isn't. nab does not log in for you. nab does not solve CAPTCHAs.</p>",
            );
        }
        html.push_str("</article></body></html>");
        html
    }

    #[test]
    fn clean_real_world_page_scores_below_30() {
        let html = clean_readme_page();
        let score = detect_labyrinth(&html, &url());
        eprintln!("clean: total={} signals={:?}", score.total, score.signals);
        assert!(
            score.total < SUSPICIOUS_THRESHOLD,
            "clean page scored {} (>= {SUSPICIOUS_THRESHOLD})",
            score.total
        );
        assert_eq!(score.verdict, Verdict::Clean);
    }

    #[test]
    fn synthetic_labyrinth_scores_above_60() {
        let html = synthetic_trap_page();
        let score = detect_labyrinth(&html, &url());
        eprintln!("trap: total={} signals={:?}", score.total, score.signals);
        assert!(
            score.total >= TRAP_THRESHOLD,
            "trap page only scored {} (< {TRAP_THRESHOLD})",
            score.total
        );
        assert_eq!(score.verdict, Verdict::Trap);
    }

    #[test]
    fn partial_match_is_suspicious() {
        let html = partial_match_page();
        let score = detect_labyrinth(&html, &url());
        eprintln!("partial: total={} signals={:?}", score.total, score.signals);
        assert_eq!(
            score.verdict,
            Verdict::Suspicious,
            "expected Suspicious, got {:?} (total={})",
            score.verdict,
            score.total
        );
    }

    #[test]
    fn legitimate_cookie_banner_is_clean() {
        let html = legitimate_cookie_banner_page();
        let score = detect_labyrinth(&html, &url());
        eprintln!(
            "cookie banner: total={} signals={:?}",
            score.total, score.signals
        );
        assert_eq!(score.verdict, Verdict::Clean);
    }

    #[test]
    fn empty_page_does_not_panic() {
        let score = detect_labyrinth("", &url());
        assert_eq!(score.verdict, Verdict::Clean);
        assert_eq!(score.signals.len(), 5);
    }

    #[test]
    fn looks_hidden_recognises_common_patterns() {
        assert!(looks_hidden(Some("display: none"), None, false));
        assert!(looks_hidden(Some("DISPLAY:NONE;"), None, false));
        assert!(looks_hidden(Some("visibility: hidden"), None, false));
        assert!(looks_hidden(Some("opacity: 0"), None, false));
        assert!(looks_hidden(None, Some("true"), false));
        assert!(looks_hidden(None, None, true));
        assert!(!looks_hidden(Some("color: red"), None, false));
        assert!(!looks_hidden(None, Some("false"), false));
        assert!(!looks_hidden(None, None, false));
    }

    #[test]
    fn fixture_file_scores_as_trap() {
        let html = std::fs::read_to_string(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/tests/fixtures/labyrinth_sample.html"
        ))
        .expect("fixture file present");
        let score = detect_labyrinth(&html, &url());
        eprintln!("fixture: total={} signals={:?}", score.total, score.signals);
        assert!(score.is_trap(), "fixture should classify as Trap");
    }
}