use std::collections::HashSet;
use scraper::{Html, Selector};
use std::sync::LazyLock;
use url::Url;
/// Total score at or above which a page is classified [`Verdict::Suspicious`].
const SUSPICIOUS_THRESHOLD: f32 = 30.0;
/// Total score at or above which a page is classified [`Verdict::Trap`].
const TRAP_THRESHOLD: f32 = 60.0;
// CSS selectors are compiled once and shared; each pattern is a static,
// well-formed selector, so the `expect`s can only fire on a programmer error.
static A_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("a").expect("static <a> selector"));
static TITLE_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("title").expect("static <title> selector"));
static BODY_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("body").expect("static <body> selector"));
// Matches both lowercase and uppercase `name` spellings seen in the wild.
static META_ROBOTS_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse(r#"meta[name="robots"], meta[name="ROBOTS"]"#)
.expect("static meta robots selector")
});
/// Classification of a page produced by [`detect_labyrinth`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Verdict {
/// Total score below `SUSPICIOUS_THRESHOLD`.
Clean,
/// Total score in `[SUSPICIOUS_THRESHOLD, TRAP_THRESHOLD)`.
Suspicious,
/// Total score at or above `TRAP_THRESHOLD`.
Trap,
}
/// One heuristic's contribution to the overall labyrinth score.
#[derive(Debug, Clone, PartialEq)]
pub struct Signal {
// Stable identifier of the heuristic (e.g. "hidden_link_density").
pub name: &'static str,
// Raw measured quantity (ratio, similarity, count, ...); meaning is per-signal.
pub value: f32,
// Points this signal adds to the page total.
pub score: f32,
// Human-readable explanation for logs and debugging.
pub detail: String,
}
/// Aggregate result of running all labyrinth heuristics on one page.
#[derive(Debug, Clone)]
pub struct LabyrinthScore {
// Sum of all signal scores.
pub total: f32,
// The individual per-heuristic signals that produced `total`.
pub signals: Vec<Signal>,
// Classification of `total` against the two thresholds.
pub verdict: Verdict,
}
impl LabyrinthScore {
    /// True only for the strongest classification, [`Verdict::Trap`].
    #[must_use]
    pub fn is_trap(&self) -> bool {
        self.verdict == Verdict::Trap
    }

    /// True for [`Verdict::Suspicious`] and anything stronger
    /// ([`Verdict::Trap`]), i.e. any non-clean verdict.
    #[must_use]
    pub fn is_suspicious(&self) -> bool {
        self.verdict == Verdict::Suspicious || self.verdict == Verdict::Trap
    }
}
/// Runs all five labyrinth heuristics over `html` (links resolved against
/// `url`), sums their scores, and classifies the total into a [`Verdict`].
#[must_use]
pub fn detect_labyrinth(html: &str, url: &Url) -> LabyrinthScore {
    let document = Html::parse_document(html);
    let signals = vec![
        score_hidden_link_density(&document),
        score_topic_drift(&document),
        score_meta_directive(&document),
        score_text_structure(&document),
        score_link_graph_fanout(&document, url),
    ];
    let total = signals.iter().map(|signal| signal.score).sum::<f32>();
    LabyrinthScore {
        total,
        signals,
        verdict: classify(total),
    }
}
/// Maps a summed score onto a [`Verdict`] using the two module thresholds.
/// Non-finite totals (NaN) fail both guards and fall through to `Clean`.
fn classify(total: f32) -> Verdict {
    match total {
        t if t >= TRAP_THRESHOLD => Verdict::Trap,
        t if t >= SUSPICIOUS_THRESHOLD => Verdict::Suspicious,
        _ => Verdict::Clean,
    }
}
/// Returns `true` when an element's attributes indicate it is visually hidden
/// from users — a classic cloaked-link signal.
///
/// Checks, in order: the boolean `hidden` attribute, `aria-hidden="true"`
/// (case-insensitive), and inline `style` declarations that hide the element
/// (`display:none`, `visibility:hidden`, or a zero `opacity`).
fn looks_hidden(style: Option<&str>, aria_hidden: Option<&str>, hidden_attr: bool) -> bool {
    if hidden_attr {
        return true;
    }
    if matches!(aria_hidden, Some(v) if v.eq_ignore_ascii_case("true")) {
        return true;
    }
    let Some(style) = style else { return false };
    // Normalize: drop all whitespace and lowercase so `display : NONE` and
    // `display:none` compare equal.
    let normalized: String = style
        .chars()
        .filter(|c| !c.is_whitespace())
        .collect::<String>()
        .to_ascii_lowercase();
    // Examine each `prop:value` declaration individually rather than doing a
    // raw substring search: the old `contains("opacity:0")` wrongly flagged
    // visible elements like `opacity:0.5` as hidden.
    normalized.split(';').any(|decl| {
        let decl = decl.strip_suffix("!important").unwrap_or(decl);
        if decl == "display:none" || decl == "visibility:hidden" {
            return true;
        }
        decl.strip_prefix("opacity:").is_some_and(|value| {
            // Accept `0`, `0.0`, `.0`, and the percentage form `0%`.
            let value = value.strip_suffix('%').unwrap_or(value);
            value.parse::<f32>().is_ok_and(|n| n == 0.0)
        })
    })
}
/// Scores the fraction of `<a>` elements that are visually hidden.
/// Abstains (score 0) on pages with fewer than 10 links or fewer than 5
/// hidden links, so a small cookie banner cannot trigger it.
fn score_hidden_link_density(doc: &Html) -> Signal {
    // Single pass: count every anchor and, alongside, the concealed ones.
    let (total, hidden) = doc
        .select(&A_SELECTOR)
        .fold((0u32, 0u32), |(seen, concealed), anchor| {
            let element = anchor.value();
            let is_hidden = looks_hidden(
                element.attr("style"),
                element.attr("aria-hidden"),
                element.attr("hidden").is_some(),
            );
            (seen + 1, concealed + u32::from(is_hidden))
        });
    // Clamp through u16 so the lossless u16 -> f32 conversion applies.
    let ratio = if total == 0 {
        0.0
    } else {
        f32::from(u16::try_from(hidden).unwrap_or(u16::MAX))
            / f32::from(u16::try_from(total).unwrap_or(u16::MAX))
    };
    let score = if total < 10 || hidden < 5 {
        0.0
    } else if ratio >= 0.30 {
        40.0
    } else if ratio >= 0.10 {
        25.0
    } else if ratio >= 0.02 {
        8.0
    } else {
        0.0
    };
    Signal {
        name: "hidden_link_density",
        value: ratio,
        score,
        detail: format!("{hidden}/{total} links hidden ({:.1}%)", ratio * 100.0),
    }
}
/// Splits `s` on non-alphanumeric boundaries and returns the distinct
/// lowercased words of length >= 4 that are not common stopwords.
fn tokenize(s: &str) -> HashSet<String> {
    let mut words = HashSet::new();
    for raw in s.split(|c: char| !c.is_alphanumeric()) {
        // Short fragments carry too little topical signal.
        if raw.len() < 4 {
            continue;
        }
        let lowered = raw.to_ascii_lowercase();
        if !STOPWORDS.contains(&lowered.as_str()) {
            words.insert(lowered);
        }
    }
    words
}
/// High-frequency English words (plus web boilerplate like "page"/"site")
/// that carry no topical signal and are excluded from token comparisons.
const STOPWORDS: &[&str] = &[
    "this", "that", "with", "from", "have", "been", "were", "they", "their", "there", "which",
    "about", "would", "could", "should", "into", "more", "than", "then", "what", "when", "your",
    "will", "also", "such", "some", "other", "these", "those", "page", "site",
];
/// Measures topical agreement between the `<title>` and the opening of the
/// `<body>` via Jaccard similarity of their token sets. Labyrinth pages tend
/// to pair a generic title with unrelated generated text.
fn score_topic_drift(doc: &Html) -> Signal {
    let title_text = doc
        .select(&TITLE_SELECTOR)
        .next()
        .map_or_else(String::new, |t| t.text().collect::<String>());
    let body_text = doc
        .select(&BODY_SELECTOR)
        .next()
        .map_or_else(String::new, |b| b.text().collect::<Vec<_>>().join(" "));
    // Only the first 500 chars of the body participate in the comparison.
    let body_sample: String = body_text.chars().take(500).collect();
    let title_tokens = tokenize(&title_text);
    let body_tokens = tokenize(&body_sample);
    if title_tokens.is_empty() || body_tokens.is_empty() {
        return Signal {
            name: "topic_drift",
            value: 0.0,
            score: 0.0,
            detail: "title or body too short to compare".to_string(),
        };
    }
    let shared = title_tokens.intersection(&body_tokens).count();
    let combined = title_tokens.union(&body_tokens).count();
    let similarity = if combined == 0 {
        0.0
    } else {
        shared as f32 / combined as f32
    };
    // Short pages abstain; otherwise a low Jaccard score means the title and
    // the body lead are about different things.
    let score = if body_text.len() < 800 {
        0.0
    } else if similarity < 0.05 {
        20.0
    } else if similarity < 0.15 {
        8.0
    } else {
        0.0
    };
    Signal {
        name: "topic_drift",
        value: similarity,
        score,
        detail: format!(
            "title∩body Jaccard={:.2} (title={}t, body={}t)",
            similarity,
            title_tokens.len(),
            body_tokens.len()
        ),
    }
}
/// Flags pages that carry a `noindex` robots directive while still serving a
/// substantial amount of body text — content offered to crawlers but withheld
/// from search engines.
fn score_meta_directive(doc: &Html) -> Signal {
    let noindex = doc.select(&META_ROBOTS_SELECTOR).any(|meta| {
        meta.value()
            .attr("content")
            .is_some_and(|content| content.to_ascii_lowercase().contains("noindex"))
    });
    let body_chars = doc
        .select(&BODY_SELECTOR)
        .next()
        .map_or(0, |body| body.text().map(str::len).sum::<usize>());
    // Scores only when noindex co-occurs with enough text to matter.
    let score = match (noindex, body_chars) {
        (true, n) if n > 2000 => 20.0,
        (true, n) if n > 800 => 6.0,
        _ => 0.0,
    };
    Signal {
        name: "meta_directive",
        value: if noindex { 1.0 } else { 0.0 },
        score,
        detail: format!("noindex={noindex}, body_chars={body_chars}"),
    }
}
/// Splits `text` at `.`, `!`, `?` and returns the character length of each
/// sentence of at least 3 characters. Runs of whitespace inside a sentence
/// each add one to its length; a trailing unterminated sentence still counts.
fn sentence_lengths(text: &str) -> Vec<usize> {
    let mut out = Vec::new();
    let mut run: usize = 0;
    for ch in text.chars() {
        match ch {
            '.' | '!' | '?' => {
                if run >= 3 {
                    out.push(run);
                }
                run = 0;
            }
            // Whitespace only counts once a sentence has started.
            c if c.is_whitespace() => {
                if run > 0 {
                    run += 1;
                }
            }
            _ => run += 1,
        }
    }
    if run >= 3 {
        out.push(run);
    }
    out
}
/// Population variance of `samples`; returns 0.0 for fewer than two samples,
/// which carry no spread information.
fn variance(samples: &[usize]) -> f32 {
    if samples.len() < 2 {
        return 0.0;
    }
    let n = samples.len() as f32;
    let mean = samples.iter().map(|&v| v as f32).sum::<f32>() / n;
    samples
        .iter()
        .map(|&v| {
            let delta = v as f32 - mean;
            delta * delta
        })
        .sum::<f32>()
        / n
}
/// Scores the uniformity of sentence lengths in the body text via the
/// coefficient of variation (stddev / mean). Generated filler tends to have
/// eerily uniform sentence lengths (low cv); abstains below 8 sentences.
fn score_text_structure(doc: &Html) -> Signal {
    let body_text = doc
        .select(&BODY_SELECTOR)
        .next()
        .map_or_else(String::new, |body| body.text().collect::<Vec<_>>().join(" "));
    let lengths = sentence_lengths(&body_text);
    if lengths.len() < 8 {
        return Signal {
            name: "text_structure",
            value: 0.0,
            score: 0.0,
            detail: format!("only {} sentences, abstaining", lengths.len()),
        };
    }
    let mean = lengths.iter().map(|&n| n as f32).sum::<f32>() / lengths.len() as f32;
    let cv = if mean > 0.0 {
        variance(&lengths).sqrt() / mean
    } else {
        0.0
    };
    let score = if cv < 0.20 {
        15.0
    } else if cv < 0.30 {
        6.0
    } else {
        0.0
    };
    Signal {
        name: "text_structure",
        value: cv,
        score,
        detail: format!("{} sentences, mean={mean:.1}, cv={cv:.2}", lengths.len()),
    }
}
/// Scores the fanout of same-host links, weighting distinct internal targets
/// against how many of their final path segments look machine-generated
/// (long, mixing digits and letters). Labyrinths spray many such links.
fn score_link_graph_fanout(doc: &Html, base: &Url) -> Signal {
    let base_host = base.host_str().unwrap_or("");
    let mut internal_targets: HashSet<String> = HashSet::new();
    let mut slug_like = 0u32;
    for anchor in doc.select(&A_SELECTOR) {
        let Some(href) = anchor.value().attr("href") else {
            continue;
        };
        let Ok(target) = base.join(href) else {
            continue;
        };
        // Only same-host links count; external links are not fanout.
        if base_host.is_empty() || target.host_str() != Some(base_host) {
            continue;
        }
        let path = target.path().to_string();
        let generated = path
            .rsplit('/')
            .find(|segment| !segment.is_empty())
            .is_some_and(|segment| {
                segment.len() >= 10
                    && segment.chars().any(|c| c.is_ascii_digit())
                    && segment.chars().any(|c| c.is_ascii_alphabetic())
            });
        if generated {
            slug_like += 1;
        }
        internal_targets.insert(path);
    }
    let distinct = internal_targets.len();
    let score = if distinct >= 40 && slug_like >= 20 {
        15.0
    } else if distinct >= 20 && slug_like >= 10 {
        8.0
    } else {
        0.0
    };
    Signal {
        name: "link_graph_fanout",
        value: distinct as f32,
        score,
        detail: format!("{distinct} distinct internal targets, {slug_like} slug-like"),
    }
}
#[cfg(test)]
#[allow(clippy::format_push_string)] mod tests {
use super::*;
// Base URL every fixture page is resolved against.
fn url() -> Url {
Url::parse("https://example.com/article").unwrap()
}
// Benign fixture: README-style page whose title matches the body, with only
// a handful of visible navigation links. Should classify as Clean.
fn clean_readme_page() -> String {
let mut html = String::from(
r#"<!doctype html>
<html lang="en">
<head>
<title>nab — README</title>
<meta charset="utf-8">
</head>
<body>
<header><a href="/">Home</a> <a href="/docs">Docs</a> <a href="/changelog">Changelog</a></header>
<article>
<h1>nab — README</h1>
<p>nab is a small command-line HTTP client that turns any URL into clean
markdown for LLM context windows. It supports cookies, 1Password, HTTP/3,
and a labyrinth detector that protects scrapers from Cloudflare's bot trap.</p>
<p>Install with cargo install nab. The README explains every flag in detail.
Most users only need <code>nab fetch URL</code>; everything else is opt-in.</p>
<p>nab speaks markdown so an LLM can read its output without wading through
HTML chrome. The readability extractor keeps the article body and discards
navigation, ads, and footers.</p>
<p>Building from source requires Rust 1.93. Run cargo test to execute the
suite, which is fast: most tests finish in milliseconds.</p>
<p>Bug reports are welcome on GitHub. Please attach a minimal reproduction.</p>
</article>
<footer><a href="/about">About</a></footer>
</body>
</html>"#,
);
// Pad the body past the topic-drift minimum with on-topic prose.
for _ in 0..3 {
html.push_str(
r"<p>nab is honest about its limitations. It will not log in for you,
will not solve CAPTCHAs, and will not pretend to be a browser it isn't.
The goal is reliable text, not stealth. Yes, that means some sites refuse
to serve us. We accept that trade-off.</p>",
);
}
html
}
// Adversarial fixture: noindex directive, 50 hidden slug-like links, and
// uniformly-sized generated sentences. Should trip every heuristic and
// classify as Trap.
fn synthetic_trap_page() -> String {
let mut html = String::from(
r#"<!doctype html>
<html><head>
<title>Curated Hub</title>
<meta name="robots" content="noindex, nofollow">
</head><body>
<h1>Knowledge Index</h1>
"#,
);
// 50 hidden internal links with generated-looking slugs.
for i in 0..50 {
html.push_str(&format!(
r#"<a href="/labyrinth/article-{i}-kepler-9b" style="display:none">Article {i} on Kepler 9b</a>
"#,
));
}
// A few visible slug-like links so the fanout signal sees both kinds.
for i in 0..6 {
html.push_str(&format!(
r#"<a href="/labyrinth/topic-{i}-quantum-3a">Topic {i}</a>
"#,
));
}
// Repetitive filler with near-constant sentence lengths (low cv), off-topic
// relative to the "Curated Hub" title.
for _ in 0..30 {
html.push_str(
"The orbit measured exactly forty seven minutes around the dwarf companion. \
Astronomers logged precisely twelve transits during the seasonal window. \
Spectral lines indicated roughly six different volatile compounds present. \
Photometric variation peaked around eighteen percent across the cycle. \
Researchers documented carefully nine candidate planetary signatures here. ",
);
}
html.push_str("</body></html>");
html
}
// Middle-ground fixture: noindex + off-topic uniform text, but too few links
// to look like a full labyrinth. Should land in the Suspicious band.
fn partial_match_page() -> String {
let mut html = String::from(
r#"<!doctype html>
<html><head><title>Acme Login</title>
<meta name="robots" content="noindex">
</head><body>"#,
);
for _ in 0..40 {
html.push_str(
"<p>The orbit measured forty seven minutes around the dwarf companion star. \
Spectral lines indicated multiple different volatile compounds present today. \
Photometric variation peaked at eighteen percent across the observation cycle.</p>",
);
}
for i in 0..5 {
html.push_str(&format!(r#"<a href="/page{i}">Page {i}</a>"#));
}
html.push_str("</body></html>");
html
}
// Regression fixture: a legitimate page with a couple of hidden cookie-banner
// links. The hidden-link signal must abstain (fewer than 5 hidden links) and
// the page must stay Clean.
fn legitimate_cookie_banner_page() -> String {
let mut html = String::from(
r#"<!doctype html>
<html><head><title>Acme Blog — How we built nab</title></head>
<body>
<nav>
<a href="/">Home</a>
<a href="/blog">Blog</a>
<a href="/about">About</a>
</nav>
<div class="cookie-banner">
<a href="/cookies" style="display:none">Cookie policy</a>
<a href="/privacy" style="display:none">Privacy policy</a>
</div>
<article>
<h1>How we built nab</h1>
"#,
);
for _ in 0..5 {
html.push_str(
"<p>We built nab because the existing tools were either too heavy or too \
fragile. We wanted a single binary that could fetch any URL, follow \
redirects sanely, and give us back clean markdown. The first prototype \
took a weekend. Most of the work after that was removing things.</p>
<p>Honesty is the design principle. nab does not pretend to be a browser \
it isn't. nab does not log in for you. nab does not solve CAPTCHAs.</p>",
);
}
html.push_str("</article></body></html>");
html
}
#[test]
fn clean_real_world_page_scores_below_30() {
let html = clean_readme_page();
let score = detect_labyrinth(&html, &url());
eprintln!("clean: total={} signals={:?}", score.total, score.signals);
assert!(
score.total < SUSPICIOUS_THRESHOLD,
"clean page scored {} (>= {SUSPICIOUS_THRESHOLD})",
score.total
);
assert_eq!(score.verdict, Verdict::Clean);
}
#[test]
fn synthetic_labyrinth_scores_above_60() {
let html = synthetic_trap_page();
let score = detect_labyrinth(&html, &url());
eprintln!("trap: total={} signals={:?}", score.total, score.signals);
assert!(
score.total >= TRAP_THRESHOLD,
"trap page only scored {} (< {TRAP_THRESHOLD})",
score.total
);
assert_eq!(score.verdict, Verdict::Trap);
}
#[test]
fn partial_match_is_suspicious() {
let html = partial_match_page();
let score = detect_labyrinth(&html, &url());
eprintln!("partial: total={} signals={:?}", score.total, score.signals);
assert_eq!(
score.verdict,
Verdict::Suspicious,
"expected Suspicious, got {:?} (total={})",
score.verdict,
score.total
);
}
#[test]
fn legitimate_cookie_banner_is_clean() {
let html = legitimate_cookie_banner_page();
let score = detect_labyrinth(&html, &url());
eprintln!(
"cookie banner: total={} signals={:?}",
score.total, score.signals
);
assert_eq!(score.verdict, Verdict::Clean);
}
// Empty input must still yield one Signal per heuristic, never panic.
#[test]
fn empty_page_does_not_panic() {
let score = detect_labyrinth("", &url());
assert_eq!(score.verdict, Verdict::Clean);
assert_eq!(score.signals.len(), 5);
}
#[test]
fn looks_hidden_recognises_common_patterns() {
assert!(looks_hidden(Some("display: none"), None, false));
assert!(looks_hidden(Some("DISPLAY:NONE;"), None, false));
assert!(looks_hidden(Some("visibility: hidden"), None, false));
assert!(looks_hidden(Some("opacity: 0"), None, false));
assert!(looks_hidden(None, Some("true"), false));
assert!(looks_hidden(None, None, true));
assert!(!looks_hidden(Some("color: red"), None, false));
assert!(!looks_hidden(None, Some("false"), false));
assert!(!looks_hidden(None, None, false));
}
// End-to-end check against a captured real-world trap page on disk.
#[test]
fn fixture_file_scores_as_trap() {
let html = std::fs::read_to_string(concat!(
env!("CARGO_MANIFEST_DIR"),
"/tests/fixtures/labyrinth_sample.html"
))
.expect("fixture file present");
let score = detect_labyrinth(&html, &url());
eprintln!("fixture: total={} signals={:?}", score.total, score.signals);
assert!(score.is_trap(), "fixture should classify as Trap");
}
}