1use std::collections::HashSet;
52
53use scraper::{Html, Selector};
54use std::sync::LazyLock;
55use url::Url;
56
/// Lowest total score that `classify` reports as `Verdict::Suspicious`.
const SUSPICIOUS_THRESHOLD: f32 = 30.0;

/// Lowest total score that `classify` reports as `Verdict::Trap`.
const TRAP_THRESHOLD: f32 = 60.0;
65
66static A_SELECTOR: LazyLock<Selector> =
69 LazyLock::new(|| Selector::parse("a").expect("static <a> selector"));
70static TITLE_SELECTOR: LazyLock<Selector> =
71 LazyLock::new(|| Selector::parse("title").expect("static <title> selector"));
72static BODY_SELECTOR: LazyLock<Selector> =
73 LazyLock::new(|| Selector::parse("body").expect("static <body> selector"));
74static META_ROBOTS_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
75 Selector::parse(r#"meta[name="robots"], meta[name="ROBOTS"]"#)
76 .expect("static meta robots selector")
77});
78
/// Final classification of a page, derived from its total score.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Verdict {
    /// The total score stayed below `SUSPICIOUS_THRESHOLD`.
    Clean,
    /// The total score reached `SUSPICIOUS_THRESHOLD` but not `TRAP_THRESHOLD`.
    Suspicious,
    /// The total score reached `TRAP_THRESHOLD`.
    Trap,
}
93
/// One heuristic's contribution to the overall labyrinth score.
#[derive(Debug, Clone, PartialEq)]
pub struct Signal {
    /// Stable identifier of the heuristic, e.g. `"hidden_link_density"`.
    pub name: &'static str,
    /// Raw measurement the heuristic observed (a ratio, similarity, count, ...).
    pub value: f32,
    /// Points this heuristic adds to the page total.
    pub score: f32,
    /// Human-readable explanation of the measurement.
    pub detail: String,
}
106
107#[derive(Debug, Clone)]
109pub struct LabyrinthScore {
110 pub total: f32,
112 pub signals: Vec<Signal>,
114 pub verdict: Verdict,
116}
117
118impl LabyrinthScore {
119 #[must_use]
121 pub fn is_trap(&self) -> bool {
122 matches!(self.verdict, Verdict::Trap)
123 }
124
125 #[must_use]
127 pub fn is_suspicious(&self) -> bool {
128 matches!(self.verdict, Verdict::Suspicious | Verdict::Trap)
129 }
130}
131
132#[must_use]
144pub fn detect_labyrinth(html: &str, url: &Url) -> LabyrinthScore {
145 let doc = Html::parse_document(html);
146
147 let signals = vec![
148 score_hidden_link_density(&doc),
149 score_topic_drift(&doc),
150 score_meta_directive(&doc),
151 score_text_structure(&doc),
152 score_link_graph_fanout(&doc, url),
153 ];
154
155 let total: f32 = signals.iter().map(|s| s.score).sum();
156 let verdict = classify(total);
157
158 LabyrinthScore {
159 total,
160 signals,
161 verdict,
162 }
163}
164
165fn classify(total: f32) -> Verdict {
166 if total >= TRAP_THRESHOLD {
167 Verdict::Trap
168 } else if total >= SUSPICIOUS_THRESHOLD {
169 Verdict::Suspicious
170 } else {
171 Verdict::Clean
172 }
173}
174
/// Reports whether an element's attributes mark it as hidden from human readers.
///
/// An element counts as hidden when any of the following holds:
/// - the boolean `hidden` attribute is present (`hidden_attr`);
/// - `aria_hidden` equals `true`, compared ASCII case-insensitively;
/// - the inline `style` declares `display: none`, `visibility: hidden`, or an
///   `opacity` of zero.
///
/// The style is parsed declaration by declaration and opacity is compared
/// numerically, so `opacity: 0`, `0.0`, `.0` and `0%` match while partially
/// transparent values such as `opacity: 0.5` do not. A trailing `!important`
/// is ignored.
fn looks_hidden(style: Option<&str>, aria_hidden: Option<&str>, hidden_attr: bool) -> bool {
    if hidden_attr {
        return true;
    }
    if aria_hidden.is_some_and(|value| value.eq_ignore_ascii_case("true")) {
        return true;
    }
    let Some(style) = style else { return false };

    // Drop all whitespace and lowercase, so `DISPLAY : None` and `display:none`
    // are compared in the same form.
    let compact: String = style
        .chars()
        .filter(|c| !c.is_whitespace())
        .map(|c| c.to_ascii_lowercase())
        .collect();

    compact.split(';').any(|declaration| {
        let Some((property, value)) = declaration.split_once(':') else {
            return false;
        };
        let value = value.strip_suffix("!important").unwrap_or(value);
        match property {
            "display" => value == "none",
            "visibility" => value == "hidden",
            "opacity" => {
                // A substring test for "opacity:0" would also match 0.5 or 0.9;
                // only a value that parses to zero (or below) hides the element.
                let number = value.strip_suffix('%').unwrap_or(value);
                number.parse::<f32>().is_ok_and(|alpha| alpha <= 0.0)
            }
            _ => false,
        }
    })
}
197
198fn score_hidden_link_density(doc: &Html) -> Signal {
199 let mut total = 0u32;
200 let mut hidden = 0u32;
201
202 for a in doc.select(&A_SELECTOR) {
203 total += 1;
204 let el = a.value();
205 if looks_hidden(
206 el.attr("style"),
207 el.attr("aria-hidden"),
208 el.attr("hidden").is_some(),
209 ) {
210 hidden += 1;
211 }
212 }
213
214 let ratio = if total == 0 {
215 0.0
216 } else {
217 f32::from(u16::try_from(hidden).unwrap_or(u16::MAX))
218 / f32::from(u16::try_from(total).unwrap_or(u16::MAX))
219 };
220
221 let score = if total < 10 || hidden < 5 {
230 0.0
232 } else if ratio >= 0.30 {
233 40.0
234 } else if ratio >= 0.10 {
235 25.0
236 } else if ratio >= 0.02 {
237 8.0
238 } else {
239 0.0
240 };
241
242 Signal {
243 name: "hidden_link_density",
244 value: ratio,
245 score,
246 detail: format!("{hidden}/{total} links hidden ({:.1}%)", ratio * 100.0),
247 }
248}
249
/// Splits `s` into a set of lowercase content words.
///
/// Words are maximal runs of alphanumeric characters. A word is kept when it
/// has at least four characters and is not listed in [`STOPWORDS`]. Length is
/// measured in characters and lowercasing is Unicode-aware, matching the
/// Unicode-aware splitting: `ÜBER` and `über` produce the same token, and a
/// three-letter word such as `été` is not kept because of its byte length.
fn tokenize(s: &str) -> HashSet<String> {
    s.split(|c: char| !c.is_alphanumeric())
        // `nth(3)` succeeds exactly when the word has four or more characters.
        .filter(|word| word.chars().nth(3).is_some())
        .map(str::to_lowercase)
        .filter(|word| !STOPWORDS.contains(&word.as_str()))
        .collect()
}

/// Lowercase words that [`tokenize`] discards even though they are long enough.
const STOPWORDS: &[&str] = &[
    "this", "that", "with", "from", "have", "been", "were", "they", "their", "there", "which",
    "about", "would", "could", "should", "into", "more", "than", "then", "what", "when", "your",
    "will", "also", "such", "some", "other", "these", "those", "page", "site",
];
269
270fn score_topic_drift(doc: &Html) -> Signal {
271 let title_text: String = doc
272 .select(&TITLE_SELECTOR)
273 .next()
274 .map(|t| t.text().collect::<String>())
275 .unwrap_or_default();
276
277 let body_text: String = doc
278 .select(&BODY_SELECTOR)
279 .next()
280 .map(|b| b.text().collect::<Vec<_>>().join(" "))
281 .unwrap_or_default();
282
283 let body_sample: String = body_text.chars().take(500).collect();
284
285 let title_tokens = tokenize(&title_text);
286 let body_tokens = tokenize(&body_sample);
287
288 if title_tokens.is_empty() || body_tokens.is_empty() {
289 return Signal {
291 name: "topic_drift",
292 value: 0.0,
293 score: 0.0,
294 detail: "title or body too short to compare".to_string(),
295 };
296 }
297
298 let intersection = title_tokens.intersection(&body_tokens).count();
299 let union = title_tokens.union(&body_tokens).count();
300 let similarity = if union == 0 {
302 0.0
303 } else {
304 intersection as f32 / union as f32
305 };
306
307 let score = if body_text.len() < 800 {
311 0.0
312 } else if similarity < 0.05 {
313 20.0
314 } else if similarity < 0.15 {
315 8.0
316 } else {
317 0.0
318 };
319
320 Signal {
321 name: "topic_drift",
322 value: similarity,
323 score,
324 detail: format!(
325 "title∩body Jaccard={:.2} (title={}t, body={}t)",
326 similarity,
327 title_tokens.len(),
328 body_tokens.len()
329 ),
330 }
331}
332
333fn score_meta_directive(doc: &Html) -> Signal {
336 let mut noindex = false;
337 for meta in doc.select(&META_ROBOTS_SELECTOR) {
338 if let Some(content) = meta.value().attr("content")
339 && content.to_ascii_lowercase().contains("noindex")
340 {
341 noindex = true;
342 break;
343 }
344 }
345
346 let body_chars = doc
347 .select(&BODY_SELECTOR)
348 .next()
349 .map_or(0, |b| b.text().map(str::len).sum::<usize>());
350
351 let score = if noindex && body_chars > 2000 {
355 20.0
356 } else if noindex && body_chars > 800 {
357 6.0
358 } else {
359 0.0
360 };
361
362 Signal {
363 name: "meta_directive",
364 value: if noindex { 1.0 } else { 0.0 },
365 score,
366 detail: format!("noindex={noindex}, body_chars={body_chars}"),
367 }
368}
369
/// Returns the length, in characters, of each sentence in `text`.
///
/// A sentence ends at `.`, `!` or `?`, or at the end of the input. Whitespace
/// before a sentence's first visible character is not counted; whitespace
/// after it is. Sentences shorter than three characters are dropped.
fn sentence_lengths(text: &str) -> Vec<usize> {
    const MIN_SENTENCE_CHARS: usize = 3;

    let mut lengths = Vec::new();
    let mut run: usize = 0;

    for ch in text.chars() {
        match ch {
            '.' | '!' | '?' => {
                // Close the current sentence and start a new, empty one.
                let finished = std::mem::take(&mut run);
                if finished >= MIN_SENTENCE_CHARS {
                    lengths.push(finished);
                }
            }
            // Leading whitespace does not belong to the sentence.
            c if c.is_whitespace() && run == 0 => {}
            _ => run += 1,
        }
    }

    // Text after the last terminator still counts as a sentence.
    if run >= MIN_SENTENCE_CHARS {
        lengths.push(run);
    }
    lengths
}
393
/// Population variance of `samples` (sum of squared deviations divided by the count).
///
/// Returns `0.0` when there are fewer than two samples.
fn variance(samples: &[usize]) -> f32 {
    let count = samples.len();
    if count < 2 {
        return 0.0;
    }

    let n = count as f32;
    let mean = samples.iter().map(|&sample| sample as f32).sum::<f32>() / n;
    let squared_deviations: f32 = samples
        .iter()
        .map(|&sample| sample as f32 - mean)
        .map(|deviation| deviation * deviation)
        .sum();

    squared_deviations / n
}
409
410fn score_text_structure(doc: &Html) -> Signal {
411 let body_text: String = doc
412 .select(&BODY_SELECTOR)
413 .next()
414 .map(|b| b.text().collect::<Vec<_>>().join(" "))
415 .unwrap_or_default();
416
417 let lengths = sentence_lengths(&body_text);
418 if lengths.len() < 8 {
419 return Signal {
420 name: "text_structure",
421 value: 0.0,
422 score: 0.0,
423 detail: format!("only {} sentences, abstaining", lengths.len()),
424 };
425 }
426 let var = variance(&lengths);
427 let mean: f32 = lengths.iter().map(|&n| n as f32).sum::<f32>() / lengths.len() as f32;
428 let cv = if mean > 0.0 { var.sqrt() / mean } else { 0.0 };
430
431 let score = if cv < 0.20 {
434 15.0
435 } else if cv < 0.30 {
436 6.0
437 } else {
438 0.0
439 };
440
441 Signal {
442 name: "text_structure",
443 value: cv,
444 score,
445 detail: format!("{} sentences, mean={mean:.1}, cv={cv:.2}", lengths.len()),
446 }
447}
448
449fn score_link_graph_fanout(doc: &Html, base: &Url) -> Signal {
452 let base_host = base.host_str().unwrap_or("");
453 let mut internal_targets: HashSet<String> = HashSet::new();
454 let mut slug_like: u32 = 0;
455
456 for a in doc.select(&A_SELECTOR) {
457 let Some(href) = a.value().attr("href") else {
458 continue;
459 };
460 let Ok(resolved) = base.join(href) else {
461 continue;
462 };
463 if resolved.host_str() != Some(base_host) || base_host.is_empty() {
464 continue;
465 }
466 let path = resolved.path().to_string();
467 if let Some(last) = path.rsplit('/').find(|s| !s.is_empty())
471 && last.len() >= 10
472 && last.chars().any(|c| c.is_ascii_digit())
473 && last.chars().any(|c| c.is_ascii_alphabetic())
474 {
475 slug_like += 1;
476 }
477 internal_targets.insert(path);
478 }
479
480 let distinct = internal_targets.len();
481 let score = if distinct >= 40 && slug_like >= 20 {
484 15.0
485 } else if distinct >= 20 && slug_like >= 10 {
486 8.0
487 } else {
488 0.0
489 };
490
491 Signal {
492 name: "link_graph_fanout",
493 value: distinct as f32,
494 score,
495 detail: format!("{distinct} distinct internal targets, {slug_like} slug-like"),
496 }
497}
498
#[cfg(test)]
mod tests {
    // `write!`/`writeln!` append formatted text straight into a `String`,
    // avoiding the temporary that `push_str(&format!(..))` allocates.
    use std::fmt::Write as _;

    use super::*;

    /// Base URL every fixture page is scored against.
    fn url() -> Url {
        Url::parse("https://example.com/article").unwrap()
    }

    /// An ordinary project README: a few visible navigation links and prose
    /// with varied sentence lengths, followed by three repeated paragraphs.
    fn clean_readme_page() -> String {
        let mut html = String::from(
            r#"<!doctype html>
<html lang="en">
<head>
  <title>nab — README</title>
  <meta charset="utf-8">
</head>
<body>
  <header><a href="/">Home</a> <a href="/docs">Docs</a> <a href="/changelog">Changelog</a></header>
  <article>
    <h1>nab — README</h1>
    <p>nab is a small command-line HTTP client that turns any URL into clean
    markdown for LLM context windows. It supports cookies, 1Password, HTTP/3,
    and a labyrinth detector that protects scrapers from Cloudflare's bot trap.</p>
    <p>Install with cargo install nab. The README explains every flag in detail.
    Most users only need <code>nab fetch URL</code>; everything else is opt-in.</p>
    <p>nab speaks markdown so an LLM can read its output without wading through
    HTML chrome. The readability extractor keeps the article body and discards
    navigation, ads, and footers.</p>
    <p>Building from source requires Rust 1.93. Run cargo test to execute the
    suite, which is fast: most tests finish in milliseconds.</p>
    <p>Bug reports are welcome on GitHub. Please attach a minimal reproduction.</p>
  </article>
  <footer><a href="/about">About</a></footer>
</body>
</html>"#,
        );
        for _ in 0..3 {
            html.push_str(
                r"<p>nab is honest about its limitations. It will not log in for you,
will not solve CAPTCHAs, and will not pretend to be a browser it isn't.
The goal is reliable text, not stealth. Yes, that means some sites refuse
to serve us. We accept that trade-off.</p>",
            );
        }
        html
    }

    /// A synthetic trap: `noindex`, 50 hidden and 6 visible slug-like internal
    /// links, and 30 repetitions of five similarly sized sentences.
    fn synthetic_trap_page() -> String {
        let mut html = String::from(
            r#"<!doctype html>
<html><head>
<title>Curated Hub</title>
<meta name="robots" content="noindex, nofollow">
</head><body>
<h1>Knowledge Index</h1>
"#,
        );
        for i in 0..50 {
            writeln!(
                html,
                r#"<a href="/labyrinth/article-{i}-kepler-9b" style="display:none">Article {i} on Kepler 9b</a>"#,
            )
            .expect("writing to a String cannot fail");
        }
        for i in 0..6 {
            writeln!(html, r#"<a href="/labyrinth/topic-{i}-quantum-3a">Topic {i}</a>"#)
                .expect("writing to a String cannot fail");
        }
        for _ in 0..30 {
            html.push_str(
                "The orbit measured exactly forty seven minutes around the dwarf companion. \
                 Astronomers logged precisely twelve transits during the seasonal window. \
                 Spectral lines indicated roughly six different volatile compounds present. \
                 Photometric variation peaked around eighteen percent across the cycle. \
                 Researchers documented carefully nine candidate planetary signatures here. ",
            );
        }
        html.push_str("</body></html>");
        html
    }

    /// A page that trips only some heuristics: `noindex`, a title unrelated to
    /// the body, and 40 repetitions of uniform prose, but just five plain links.
    fn partial_match_page() -> String {
        let mut html = String::from(
            r#"<!doctype html>
<html><head><title>Acme Login</title>
<meta name="robots" content="noindex">
</head><body>"#,
        );
        for _ in 0..40 {
            html.push_str(
                "<p>The orbit measured forty seven minutes around the dwarf companion star. \
                 Spectral lines indicated multiple different volatile compounds present today. \
                 Photometric variation peaked at eighteen percent across the observation cycle.</p>",
            );
        }
        for i in 0..5 {
            write!(html, r#"<a href="/page{i}">Page {i}</a>"#)
                .expect("writing to a String cannot fail");
        }
        html.push_str("</body></html>");
        html
    }

    /// A legitimate blog post whose collapsed cookie banner hides two links.
    fn legitimate_cookie_banner_page() -> String {
        let mut html = String::from(
            r#"<!doctype html>
<html><head><title>Acme Blog — How we built nab</title></head>
<body>
<nav>
  <a href="/">Home</a>
  <a href="/blog">Blog</a>
  <a href="/about">About</a>
</nav>
<div class="cookie-banner">
  <a href="/cookies" style="display:none">Cookie policy</a>
  <a href="/privacy" style="display:none">Privacy policy</a>
</div>
<article>
<h1>How we built nab</h1>
"#,
        );
        for _ in 0..5 {
            html.push_str(
                "<p>We built nab because the existing tools were either too heavy or too \
                 fragile. We wanted a single binary that could fetch any URL, follow \
                 redirects sanely, and give us back clean markdown. The first prototype \
                 took a weekend. Most of the work after that was removing things.</p>
                 <p>Honesty is the design principle. nab does not pretend to be a browser \
                 it isn't. nab does not log in for you. nab does not solve CAPTCHAs.</p>",
            );
        }
        html.push_str("</article></body></html>");
        html
    }

    #[test]
    fn clean_real_world_page_scores_below_30() {
        let html = clean_readme_page();
        let score = detect_labyrinth(&html, &url());
        eprintln!("clean: total={} signals={:?}", score.total, score.signals);
        assert!(
            score.total < SUSPICIOUS_THRESHOLD,
            "clean page scored {} (>= {SUSPICIOUS_THRESHOLD})",
            score.total
        );
        assert_eq!(score.verdict, Verdict::Clean);
    }

    #[test]
    fn synthetic_labyrinth_scores_above_60() {
        let html = synthetic_trap_page();
        let score = detect_labyrinth(&html, &url());
        eprintln!("trap: total={} signals={:?}", score.total, score.signals);
        assert!(
            score.total >= TRAP_THRESHOLD,
            "trap page only scored {} (< {TRAP_THRESHOLD})",
            score.total
        );
        assert_eq!(score.verdict, Verdict::Trap);
    }

    #[test]
    fn partial_match_is_suspicious() {
        let html = partial_match_page();
        let score = detect_labyrinth(&html, &url());
        eprintln!("partial: total={} signals={:?}", score.total, score.signals);
        assert_eq!(
            score.verdict,
            Verdict::Suspicious,
            "expected Suspicious, got {:?} (total={})",
            score.verdict,
            score.total
        );
    }

    #[test]
    fn legitimate_cookie_banner_is_clean() {
        let html = legitimate_cookie_banner_page();
        let score = detect_labyrinth(&html, &url());
        eprintln!(
            "cookie banner: total={} signals={:?}",
            score.total, score.signals
        );
        assert_eq!(score.verdict, Verdict::Clean);
    }

    #[test]
    fn empty_page_does_not_panic() {
        let score = detect_labyrinth("", &url());
        assert_eq!(score.verdict, Verdict::Clean);
        // Every heuristic reports a signal even when it has nothing to measure.
        assert_eq!(score.signals.len(), 5);
    }

    #[test]
    fn looks_hidden_recognises_common_patterns() {
        assert!(looks_hidden(Some("display: none"), None, false));
        assert!(looks_hidden(Some("DISPLAY:NONE;"), None, false));
        assert!(looks_hidden(Some("visibility: hidden"), None, false));
        assert!(looks_hidden(Some("opacity: 0"), None, false));
        assert!(looks_hidden(None, Some("true"), false));
        assert!(looks_hidden(None, None, true));
        assert!(!looks_hidden(Some("color: red"), None, false));
        assert!(!looks_hidden(None, Some("false"), false));
        assert!(!looks_hidden(None, None, false));
    }

    /// Scores the HTML sample stored under `tests/fixtures` in the crate.
    #[test]
    fn fixture_file_scores_as_trap() {
        let html = std::fs::read_to_string(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/tests/fixtures/labyrinth_sample.html"
        ))
        .expect("fixture file present");
        let score = detect_labyrinth(&html, &url());
        eprintln!("fixture: total={} signals={:?}", score.total, score.signals);
        assert!(score.is_trap(), "fixture should classify as Trap");
    }
}