nab/content/
response_classifier.rs

1//! Shared response classification for auth walls, browser challenges, thin
2//! shells, and other fetch-time diagnostics.
3
4use super::quality::QualityScore;
5
/// Which flavour of authentication problem a response exhibits.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AuthRequiredKind {
    /// The response looks like a login wall or password gate.
    LoginRequired,
    /// The response indicates that an existing session expired or timed out.
    SessionExpired,
}
12
/// Which bot-protection or browser-challenge product a response appears to come from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BrowserChallengeKind {
    /// Cloudflare browser-verification interstitial.
    Cloudflare,
    /// Vercel Security Checkpoint page.
    Vercel,
    /// Cloudflare Turnstile widget or challenge-platform script.
    Turnstile,
    /// reCAPTCHA / hCaptcha / image CAPTCHA interstitial.
    Captcha,
    /// LinkedIn's nonstandard HTTP 999 anti-automation response.
    LinkedInBotDetection,
    /// AWS WAF challenge. The body classifier in this module matches
    /// `.awswaf.com` references and related markers; any
    /// `x-amzn-waf-action` header inspection would have to happen in the
    /// caller (not visible here).
    AwsWaf,
}
25
26/// High-level fetch-time response diagnosis.
27#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub enum ResponseDiagnosticKind {
29    AuthRequired(AuthRequiredKind),
30    BrowserChallenge(BrowserChallengeKind),
31    RateLimited,
32}
33
34/// Classified diagnostic for an HTTP response body.
35#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub struct ResponseDiagnostic {
37    pub kind: ResponseDiagnosticKind,
38    pub status: u16,
39}
40
41impl ResponseDiagnostic {
42    /// Stable machine-readable code for structured output.
43    #[must_use]
44    pub fn code(self) -> &'static str {
45        match self.kind {
46            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => {
47                "login_required"
48            }
49            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => {
50                "session_expired"
51            }
52            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare) => {
53                "cloudflare_challenge"
54            }
55            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel) => {
56                "vercel_challenge"
57            }
58            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile) => {
59                "turnstile_challenge"
60            }
61            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Captcha) => {
62                "captcha_challenge"
63            }
64            ResponseDiagnosticKind::BrowserChallenge(
65                BrowserChallengeKind::LinkedInBotDetection,
66            ) => "linkedin_bot_detection",
67            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::AwsWaf) => {
68                "aws_waf_challenge"
69            }
70            ResponseDiagnosticKind::RateLimited => "rate_limited",
71        }
72    }
73
74    /// Human-readable summary for logs and diagnostics.
75    #[must_use]
76    pub fn summary(self) -> String {
77        match self.kind {
78            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => format!(
79                "Login wall or authenticated content detected (HTTP {}).",
80                self.status
81            ),
82            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => format!(
83                "Session appears expired or timed out (HTTP {}).",
84                self.status
85            ),
86            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare) => {
87                format!(
88                    "Cloudflare browser challenge detected (HTTP {}).",
89                    self.status
90                )
91            }
92            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel) => {
93                "Vercel Security Checkpoint detected.".to_string()
94            }
95            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile) => {
96                "Cloudflare Turnstile challenge detected.".to_string()
97            }
98            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Captcha) => {
99                format!("CAPTCHA challenge detected (HTTP {}).", self.status)
100            }
101            ResponseDiagnosticKind::BrowserChallenge(
102                BrowserChallengeKind::LinkedInBotDetection,
103            ) => "LinkedIn bot detection (HTTP 999).".to_string(),
104            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::AwsWaf) => {
105                format!("AWS WAF challenge detected (HTTP {}).", self.status)
106            }
107            ResponseDiagnosticKind::RateLimited => format!(
108                "Rate limit or throttling response detected (HTTP {}).",
109                self.status
110            ),
111        }
112    }
113
114    /// Suggested remediation phrased for both CLI and MCP callers.
115    #[must_use]
116    pub fn guidance(self) -> &'static str {
117        match self.kind {
118            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => {
119                "Sign in in a browser first, then retry with the default browser cookies or a named authenticated session. If you explicitly disabled cookies, re-enable them."
120            }
121            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => {
122                "Refresh the site in a browser to renew the session, then retry with the default browser cookies or a named authenticated session."
123            }
124            ResponseDiagnosticKind::BrowserChallenge(_) => {
125                "Complete the browser challenge in a real browser first, then retry with the default browser cookies or a named session. Use an explicit browser override only if the default profile is not the authenticated one."
126            }
127            ResponseDiagnosticKind::RateLimited => {
128                "Retry later, or use an authenticated browser/session path if the site rate-limits anonymous traffic."
129            }
130        }
131    }
132
133    /// Full user-facing diagnostic message.
134    #[must_use]
135    pub fn message(self) -> String {
136        format!("{}\n{}", self.summary(), self.guidance())
137    }
138}
139
/// Coarse response classes shared by fetch diagnostics and MCP tracing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResponseClass {
    /// Request was not authorized (HTTP 401 or an expired session).
    Unauthorized,
    /// A login wall was served.
    LoginRequired,
    /// Access was denied.
    Forbidden,
    /// A bot / browser challenge was served.
    BotChallenge,
    /// The request was throttled.
    RateLimited,
    /// Extracted content is dominated by an encoded or opaque blob.
    ObfuscatedContent,
    /// Extracted content is tiny relative to the HTML it came from.
    ThinContent,
}

impl ResponseClass {
    /// Stable, machine-readable identifier for this class, intended for
    /// structured output.
    #[must_use]
    pub fn code(self) -> &'static str {
        match self {
            ResponseClass::Unauthorized => "unauthorized",
            ResponseClass::LoginRequired => "login_required",
            ResponseClass::Forbidden => "forbidden",
            ResponseClass::BotChallenge => "bot_challenge",
            ResponseClass::RateLimited => "rate_limited",
            ResponseClass::ObfuscatedContent => "obfuscated_content",
            ResponseClass::ThinContent => "thin_content",
        }
    }
}
167
168/// One classified response signal with a confidence estimate.
169#[derive(Debug, Clone, Copy, PartialEq)]
170pub struct ResponseSignal {
171    pub class: ResponseClass,
172    pub confidence: f32,
173    pub reason: &'static str,
174}
175
176/// Multi-signal response classification result.
177#[derive(Debug, Clone, Default, PartialEq)]
178pub struct ResponseClassification {
179    signals: Vec<ResponseSignal>,
180}
181
182impl ResponseClassification {
183    fn push(&mut self, signal: ResponseSignal) {
184        if !self.has_class(signal.class) {
185            self.signals.push(signal);
186        }
187    }
188
189    /// Highest-priority response signal.
190    #[must_use]
191    pub fn primary(&self) -> Option<&ResponseSignal> {
192        self.signals.first()
193    }
194
195    /// Whether the classification contains the given signal class.
196    #[must_use]
197    pub fn has_class(&self, class: ResponseClass) -> bool {
198        self.signals.iter().any(|signal| signal.class == class)
199    }
200}
201
202/// Inputs used for shared response classification.
203#[derive(Debug, Clone, Copy)]
204pub struct ResponseAnalysis<'a> {
205    pub status: u16,
206    pub body: &'a str,
207    pub content_type: Option<&'a str>,
208    pub html_bytes: Option<usize>,
209    pub markdown: Option<&'a str>,
210    pub markdown_chars: Option<usize>,
211    pub quality: Option<&'a QualityScore>,
212}
213
214/// Classify a raw HTTP response body into a higher-level auth/challenge signal.
215#[must_use]
216pub fn classify_http_response(status: u16, body: &str) -> Option<ResponseDiagnostic> {
217    let body_lower = body.to_lowercase();
218    classify_http_response_lower(status, &body_lower)
219}
220
221/// Classify a response using status/body plus optional HTML extraction signals.
222#[must_use]
223pub fn classify_response(analysis: ResponseAnalysis<'_>) -> ResponseClassification {
224    let body_lower = analysis.body.to_lowercase();
225    let mut classification = ResponseClassification::default();
226
227    if analysis.status == 401 {
228        classification.push(ResponseSignal {
229            class: ResponseClass::Unauthorized,
230            confidence: 0.97,
231            reason: "http 401 unauthorized response",
232        });
233    } else if let Some(diagnostic) = classify_http_response_lower(analysis.status, &body_lower) {
234        classification.push(map_diagnostic_signal(diagnostic));
235    } else if matches!(analysis.status, 403 | 999) && looks_like_forbidden(&body_lower) {
236        classification.push(ResponseSignal {
237            class: ResponseClass::Forbidden,
238            confidence: if analysis.status == 999 { 0.96 } else { 0.85 },
239            reason: if analysis.status == 999 {
240                "nonstandard anti-automation block status detected"
241            } else {
242                "forbidden or access-denied markers detected"
243            },
244        });
245    }
246
247    if let (Some(html_bytes), Some(markdown_chars)) = (analysis.html_bytes, analysis.markdown_chars)
248        && classify_thin_content(
249            analysis.content_type,
250            html_bytes,
251            markdown_chars,
252            analysis.quality,
253        )
254        .is_some()
255    {
256        let confidence = analysis.quality.map_or(0.78_f32, |quality| {
257            if quality.confidence < 0.5 {
258                0.9_f32
259            } else {
260                0.8_f32
261            }
262        });
263        classification.push(ResponseSignal {
264            class: ResponseClass::ThinContent,
265            confidence,
266            reason: "markdown output is disproportionately small relative to the HTML body",
267        });
268    }
269
270    if let Some(markdown) = analysis.markdown
271        && classify_obfuscated_content(analysis.content_type, markdown).is_some()
272    {
273        classification.push(ResponseSignal {
274            class: ResponseClass::ObfuscatedContent,
275            confidence: 0.95,
276            reason: "extracted content is dominated by a long encoded or obfuscated blob",
277        });
278    }
279
280    classification
281}
282
/// Details reported by CLI / MCP fetch diagnostics when a page is judged thin.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ThinContentDiagnostic {
    /// HTML size that was evaluated.
    pub html_bytes: usize,
    /// Markdown length that was evaluated.
    pub markdown_chars: usize,
    /// Whether the extraction quality score also signalled low confidence.
    pub low_confidence: bool,
}
290
291/// Shared thin-content classifier using the same narrow thresholds as HTML
292/// extraction plus an optional low-confidence signal.
293#[must_use]
294pub fn classify_thin_content(
295    content_type: Option<&str>,
296    html_bytes: usize,
297    markdown_chars: usize,
298    quality: Option<&QualityScore>,
299) -> Option<ThinContentDiagnostic> {
300    let is_html = content_type.is_some_and(|value| value.contains("html"));
301    if !is_html {
302        return None;
303    }
304
305    if is_thin_content(html_bytes, markdown_chars) {
306        return Some(ThinContentDiagnostic {
307            html_bytes,
308            markdown_chars,
309            low_confidence: quality.is_some_and(|score| score.confidence < 0.5),
310        });
311    }
312
313    if html_bytes >= 5_000
314        && markdown_chars < 800
315        && quality.is_some_and(|score| score.confidence < 0.35)
316    {
317        return Some(ThinContentDiagnostic {
318            html_bytes,
319            markdown_chars,
320            low_confidence: true,
321        });
322    }
323
324    None
325}
326
/// Details reported by CLI / MCP fetch diagnostics when extracted content is
/// judged to be an opaque blob.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ObfuscatedContentDiagnostic {
    /// Length of the longest blob-like token found.
    pub dominant_blob_chars: usize,
    /// Number of non-whitespace characters in the markdown.
    pub non_whitespace_chars: usize,
    /// Readable-word tally; the classifier stops counting at 32.
    pub readable_word_count: usize,
}
334
335/// Detect HTML extractions that are dominated by a long encoded/blob-like token
336/// rather than readable text. This catches paywall/protected pages that return a
337/// large opaque payload instead of article content.
338#[must_use]
339pub fn classify_obfuscated_content(
340    content_type: Option<&str>,
341    markdown: &str,
342) -> Option<ObfuscatedContentDiagnostic> {
343    let is_html = content_type.is_some_and(|value| value.contains("html"));
344    if !is_html {
345        return None;
346    }
347
348    let non_whitespace_chars = markdown.chars().filter(|c| !c.is_whitespace()).count();
349    if non_whitespace_chars < 2_048 {
350        return None;
351    }
352
353    let readable_word_count = markdown
354        .split_whitespace()
355        .filter(|token| looks_like_readable_word(token))
356        .take(32)
357        .count();
358    if readable_word_count >= 24 {
359        return None;
360    }
361
362    let dominant_blob_chars = markdown
363        .split_whitespace()
364        .filter_map(base64ish_blob_token_len)
365        .max()
366        .unwrap_or(0);
367    let dominant_ratio = (dominant_blob_chars * 100) / non_whitespace_chars.max(1);
368
369    if dominant_blob_chars >= 2_048 || (dominant_blob_chars >= 1_024 && dominant_ratio >= 60) {
370        return Some(ObfuscatedContentDiagnostic {
371            dominant_blob_chars,
372            non_whitespace_chars,
373            readable_word_count,
374        });
375    }
376
377    None
378}
379
380fn classify_http_response_lower(status: u16, body_lower: &str) -> Option<ResponseDiagnostic> {
381    if status == 999 {
382        return Some(ResponseDiagnostic {
383            kind: ResponseDiagnosticKind::BrowserChallenge(
384                BrowserChallengeKind::LinkedInBotDetection,
385            ),
386            status,
387        });
388    }
389
390    if looks_like_aws_waf(status, body_lower) {
391        return Some(ResponseDiagnostic {
392            kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::AwsWaf),
393            status,
394        });
395    }
396
397    if looks_like_turnstile(body_lower) {
398        return Some(ResponseDiagnostic {
399            kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile),
400            status,
401        });
402    }
403
404    if status == 429 && looks_like_vercel_checkpoint(body_lower) {
405        return Some(ResponseDiagnostic {
406            kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel),
407            status,
408        });
409    }
410
411    if matches!(status, 403 | 503) && looks_like_cloudflare_challenge(body_lower) {
412        return Some(ResponseDiagnostic {
413            kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare),
414            status,
415        });
416    }
417
418    if matches!(status, 403 | 429 | 503) && looks_like_captcha_interstitial(body_lower) {
419        return Some(ResponseDiagnostic {
420            kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Captcha),
421            status,
422        });
423    }
424
425    if matches!(status, 419 | 440) || looks_like_session_expired(body_lower) {
426        return Some(ResponseDiagnostic {
427            kind: ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired),
428            status,
429        });
430    }
431
432    if status == 429 && looks_like_rate_limit(body_lower) {
433        return Some(ResponseDiagnostic {
434            kind: ResponseDiagnosticKind::RateLimited,
435            status,
436        });
437    }
438
439    if (status == 403
440        && (looks_like_login_wall(body_lower) || looks_like_password_gate(body_lower)))
441        || (looks_like_login_wall(body_lower) && looks_like_password_gate(body_lower))
442    {
443        return Some(ResponseDiagnostic {
444            kind: ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired),
445            status,
446        });
447    }
448
449    None
450}
451
452fn map_diagnostic_signal(diagnostic: ResponseDiagnostic) -> ResponseSignal {
453    match diagnostic.kind {
454        ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => ResponseSignal {
455            class: ResponseClass::LoginRequired,
456            confidence: if diagnostic.status == 200 { 0.83 } else { 0.95 },
457            reason: "login-wall markers and password-gate signals detected",
458        },
459        ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => ResponseSignal {
460            class: ResponseClass::Unauthorized,
461            confidence: 0.94,
462            reason: "session-expired markers detected",
463        },
464        ResponseDiagnosticKind::BrowserChallenge(_) => ResponseSignal {
465            class: ResponseClass::BotChallenge,
466            confidence: 0.97,
467            reason: "browser-challenge or CAPTCHA markers detected",
468        },
469        ResponseDiagnosticKind::RateLimited => ResponseSignal {
470            class: ResponseClass::RateLimited,
471            confidence: 0.91,
472            reason: "rate-limit markers detected",
473        },
474    }
475}
476
477fn looks_like_aws_waf(status: u16, body_lower: &str) -> bool {
478    // AWS returns HTTP 202 for challenge interstitials and 403 for hard blocks.
479    // The body always references `.awswaf.com` either as a script src or as
480    // a `window.gokuProps` blob.
481    let status_matches = matches!(status, 202 | 403);
482    if !status_matches && status != 200 {
483        return false;
484    }
485    contains_any(
486        body_lower,
487        &[".awswaf.com", "window.gokuprops", "awswafintegration"],
488    )
489}
490
491fn looks_like_vercel_checkpoint(body_lower: &str) -> bool {
492    contains_any(
493        body_lower,
494        &[
495            "vercel security checkpoint",
496            "we're verifying your browser",
497            "we are verifying your browser",
498        ],
499    )
500}
501
502fn looks_like_cloudflare_challenge(body_lower: &str) -> bool {
503    contains_any(
504        body_lower,
505        &[
506            "cf-browser-verification",
507            "cf-chl-",
508            "cf-challenge",
509            "checking your browser before accessing",
510            "just a moment...",
511            "cloudflare ray id",
512        ],
513    )
514}
515
516fn looks_like_turnstile(body_lower: &str) -> bool {
517    contains_any(
518        body_lower,
519        &["cf-turnstile", "turnstile.js", "challenge-platform"],
520    )
521}
522
523fn looks_like_captcha(body_lower: &str) -> bool {
524    contains_any(
525        body_lower,
526        &["g-recaptcha", "grecaptcha", "h-captcha", "hcaptcha"],
527    ) || (body_lower.contains("captcha") && body_lower.contains("<img"))
528}
529
530fn looks_like_captcha_interstitial(body_lower: &str) -> bool {
531    looks_like_captcha(body_lower)
532        && contains_any(
533            body_lower,
534            &[
535                "verify you are human",
536                "are you human",
537                "security check",
538                "browser verification",
539                "checking your browser",
540                "please enable javascript and cookies to continue",
541            ],
542        )
543}
544
545fn looks_like_rate_limit(body_lower: &str) -> bool {
546    contains_any(
547        body_lower,
548        &[
549            "too many requests",
550            "rate limit",
551            "rate-limit",
552            "throttled",
553            "request limit reached",
554        ],
555    )
556}
557
558fn looks_like_forbidden(body_lower: &str) -> bool {
559    contains_any(
560        body_lower,
561        &[
562            "access denied",
563            "forbidden",
564            "permission denied",
565            "not authorized",
566            "not authorised",
567        ],
568    )
569}
570
571fn looks_like_session_expired(body_lower: &str) -> bool {
572    contains_any(
573        body_lower,
574        &[
575            "session expired",
576            "your session has expired",
577            "session timed out",
578            "please sign in again",
579            "please log in again",
580        ],
581    )
582}
583
584fn looks_like_login_wall(body_lower: &str) -> bool {
585    contains_any(
586        body_lower,
587        &[
588            "login required",
589            "log in to continue",
590            "sign in to continue",
591            "authentication required",
592            "please authenticate",
593            "continue with google",
594            "continue with email",
595            "sign in with",
596        ],
597    )
598}
599
600fn looks_like_password_gate(body_lower: &str) -> bool {
601    contains_any(
602        body_lower,
603        &[
604            "type=\"password\"",
605            "autocomplete=\"current-password\"",
606            "name=\"password\"",
607            "id=\"password\"",
608            "enter your password",
609            "forgot password",
610        ],
611    )
612}
613
/// Returns `true` when `haystack` contains at least one of `needles` as a
/// substring. An empty `needles` slice yields `false`.
fn contains_any(haystack: &str, needles: &[&str]) -> bool {
    needles
        .iter()
        .copied()
        .any(|marker| haystack.contains(marker))
}
617
/// Returns `true` when `token` is 4 to 24 characters long and at least 80%
/// of those characters (integer percentage) are ASCII letters.
fn looks_like_readable_word(token: &str) -> bool {
    let (total, letters) = token.chars().fold((0_usize, 0_usize), |(total, letters), ch| {
        (total + 1, letters + usize::from(ch.is_ascii_alphabetic()))
    });

    // The range check runs first, so the division never sees a zero length.
    (4..=24).contains(&total) && letters * 100 / total >= 80
}
627
/// Returns the byte length of `token` when it looks like an encoded blob,
/// otherwise `None`.
///
/// A blob is at least 768 bytes long, consists of at least 98% (integer
/// percentage) base64 / base64url bytes (ASCII alphanumerics plus
/// `+ / = _ -`), and contains at least one digit and at least one letter.
fn base64ish_blob_token_len(token: &str) -> Option<usize> {
    const MIN_BLOB_LEN: usize = 768;
    const MIN_ALLOWED_PERCENT: usize = 98;

    let total = token.len();
    if total < MIN_BLOB_LEN {
        return None;
    }

    // Tally everything in one pass over the bytes.
    let mut allowed = 0_usize;
    let mut digits = 0_usize;
    let mut letters = 0_usize;
    for byte in token.bytes() {
        if byte.is_ascii_digit() {
            digits += 1;
            allowed += 1;
        } else if byte.is_ascii_alphabetic() {
            letters += 1;
            allowed += 1;
        } else if matches!(byte, b'+' | b'/' | b'=' | b'_' | b'-') {
            allowed += 1;
        }
    }

    let mostly_allowed = allowed * 100 / total >= MIN_ALLOWED_PERCENT;
    (mostly_allowed && digits > 0 && letters > 0).then_some(total)
}
652
/// Returns `true` when the markdown is disproportionately small for the HTML
/// it was extracted from.
///
/// All three conditions must hold: the HTML is at least 5 000 long, the
/// markdown is shorter than 800, and the markdown is under 2% of the HTML
/// length (integer percentage).
fn is_thin_content(html_len: usize, markdown_len: usize) -> bool {
    const MIN_HTML_LEN: usize = 5_000;
    const MIN_MARKDOWN_LEN: usize = 800;
    const THIN_RATIO_PERCENT: usize = 2;

    let html_is_substantial = html_len >= MIN_HTML_LEN;
    let markdown_is_short = markdown_len < MIN_MARKDOWN_LEN;

    html_is_substantial
        && markdown_is_short
        && (markdown_len * 100) / html_len.max(1) < THIN_RATIO_PERCENT
}
665
#[cfg(test)]
mod tests {
    use super::{
        AuthRequiredKind, BrowserChallengeKind, ResponseAnalysis, ResponseClass,
        ResponseDiagnosticKind, classify_http_response, classify_obfuscated_content,
        classify_response,
    };

    /// Content type shared by every `classify_response` fixture.
    const HTML: Option<&str> = Some("text/html");

    /// Analysis carrying only a status and body, with no extraction data.
    fn body_only(status: u16, body: &str) -> ResponseAnalysis<'_> {
        ResponseAnalysis {
            status,
            body,
            content_type: HTML,
            html_bytes: None,
            markdown: None,
            markdown_chars: None,
            quality: None,
        }
    }

    /// Class of the primary signal produced for `analysis`, if any.
    fn primary_class(analysis: ResponseAnalysis<'_>) -> Option<ResponseClass> {
        classify_response(analysis)
            .primary()
            .map(|signal| signal.class)
    }

    #[test]
    fn classify_http_response_detects_vercel_checkpoint() {
        let diagnostic =
            classify_http_response(429, "<html><body>Vercel Security Checkpoint</body></html>")
                .expect("vercel classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel)
        );
        assert_eq!(diagnostic.code(), "vercel_challenge");
    }

    #[test]
    fn classify_http_response_detects_cloudflare_challenge() {
        let diagnostic = classify_http_response(
            403,
            "<div id='cf-browser-verification'>Please wait...</div>",
        )
        .expect("cloudflare classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare)
        );
    }

    #[test]
    fn classify_http_response_detects_turnstile_challenge_on_200() {
        let diagnostic = classify_http_response(
            200,
            "<div class='cf-turnstile'></div><script src='turnstile.js'></script>",
        )
        .expect("turnstile classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile)
        );
    }

    #[test]
    fn classify_http_response_detects_login_wall_with_password_form() {
        let body = r#"
            <html><body>
              <h1>Sign in to continue</h1>
              <form><input type="password" name="password"></form>
            </body></html>
        "#;
        let diagnostic = classify_http_response(200, body).expect("login wall classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired)
        );
        assert_eq!(diagnostic.code(), "login_required");
    }

    #[test]
    fn classify_http_response_detects_session_expired() {
        let diagnostic = classify_http_response(
            200,
            "<html><body>Your session has expired. Please sign in again.</body></html>",
        )
        .expect("session expired classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired)
        );
    }

    #[test]
    fn classify_http_response_detects_rate_limit() {
        let diagnostic = classify_http_response(429, "Too many requests. Rate limit reached.")
            .expect("rate limit classification");
        assert_eq!(diagnostic.kind, ResponseDiagnosticKind::RateLimited);
    }

    #[test]
    fn classify_http_response_ignores_normal_html() {
        let diagnostic = classify_http_response(
            200,
            "<html><body><article><h1>Hello</h1><p>World</p></article></body></html>",
        );
        assert!(
            diagnostic.is_none(),
            "expected no diagnostic for regular article HTML"
        );
    }

    #[test]
    fn classify_response_marks_thin_html_content() {
        let analysis = ResponseAnalysis {
            html_bytes: Some(20_000),
            markdown: Some("short"),
            markdown_chars: Some(120),
            ..body_only(200, "<html></html>")
        };
        assert!(classify_response(analysis).has_class(ResponseClass::ThinContent));
    }

    #[test]
    fn classify_response_maps_session_expired_to_unauthorized() {
        let analysis = body_only(
            200,
            "<html><body>Your session has expired. Please sign in again.</body></html>",
        );
        assert_eq!(primary_class(analysis), Some(ResponseClass::Unauthorized));
    }

    #[test]
    fn classify_response_maps_http_401_to_unauthorized() {
        let analysis = body_only(401, "<html><body>Unauthorized</body></html>");
        assert_eq!(primary_class(analysis), Some(ResponseClass::Unauthorized));
    }

    #[test]
    fn classify_response_detects_obfuscated_content_blob() {
        let blob = format!("Title: Protected article\n\n{}", "AbC123+/".repeat(700));
        let analysis = ResponseAnalysis {
            html_bytes: Some(40_000),
            markdown: Some(&blob),
            markdown_chars: Some(blob.len()),
            ..body_only(
                200,
                "<html><body><script>protected payload</script></body></html>",
            )
        };
        assert_eq!(
            primary_class(analysis),
            Some(ResponseClass::ObfuscatedContent)
        );
        assert!(
            classify_obfuscated_content(HTML, &blob).is_some(),
            "expected blob classification for encoded markdown"
        );
    }

    #[test]
    fn classify_obfuscated_content_ignores_readable_article_with_one_blob() {
        let pasted_token = "AbC123+/".repeat(180);
        let article = [
            "This article explains a benchmark result in normal prose.",
            "It includes enough readable words to look like a real article body.",
            "A single pasted token should not dominate the classification.",
            pasted_token.as_str(),
        ]
        .join(" ");
        assert!(
            classify_obfuscated_content(HTML, &article).is_none(),
            "expected readable article to avoid obfuscated classification"
        );
    }

    #[test]
    fn classify_http_response_detects_aws_waf_challenge() {
        let body = r#"<script src="https://abc.awswaf.com/xyz/challenge.js"></script>"#;
        let diagnostic = classify_http_response(202, body).expect("aws waf classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::AwsWaf)
        );
        assert_eq!(diagnostic.code(), "aws_waf_challenge");
    }

    #[test]
    fn classify_http_response_avoids_login_page_recaptcha_false_positive() {
        // A CAPTCHA widget on a login form must not be mistaken for a
        // challenge interstitial.
        let body = r#"
            <html><body>
              <h1>Sign in to continue</h1>
              <form><input type="password" name="password"></form>
              <div class="g-recaptcha"></div>
            </body></html>
        "#;
        let diagnostic = classify_http_response(200, body).expect("login wall classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired)
        );
    }
}
nab/content/response_classifier.rs

nab/content/
response_classifier.rs