1use super::quality::QualityScore;
5
/// The two authentication-related diagnoses a response can receive.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AuthRequiredKind {
    /// The response looks like a login wall (diagnostic code `login_required`).
    LoginRequired,
    /// The session appears expired or timed out (diagnostic code `session_expired`).
    SessionExpired,
}
12
/// The browser-challenge / anti-bot mechanisms this module can recognise.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BrowserChallengeKind {
    /// Cloudflare browser challenge (diagnostic code `cloudflare_challenge`).
    Cloudflare,
    /// Vercel Security Checkpoint (diagnostic code `vercel_challenge`).
    Vercel,
    /// Cloudflare Turnstile widget (diagnostic code `turnstile_challenge`).
    Turnstile,
    /// Generic CAPTCHA interstitial (diagnostic code `captcha_challenge`).
    Captcha,
    /// LinkedIn bot detection, reported for HTTP status 999
    /// (diagnostic code `linkedin_bot_detection`).
    LinkedInBotDetection,
    /// AWS WAF challenge (diagnostic code `aws_waf_challenge`).
    AwsWaf,
}
25
/// Top-level category of a [`ResponseDiagnostic`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResponseDiagnosticKind {
    /// The content sits behind authentication; the payload says which flavour.
    AuthRequired(AuthRequiredKind),
    /// A browser challenge or anti-bot page was served; the payload names the vendor.
    BrowserChallenge(BrowserChallengeKind),
    /// The server is rate limiting or throttling the request.
    RateLimited,
}
33
/// A diagnosis attached to an HTTP response, pairing what was detected with
/// the status code it was detected on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ResponseDiagnostic {
    /// What was detected.
    pub kind: ResponseDiagnosticKind,
    /// HTTP status code of the classified response.
    pub status: u16,
}
40
41impl ResponseDiagnostic {
42 #[must_use]
44 pub fn code(self) -> &'static str {
45 match self.kind {
46 ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => {
47 "login_required"
48 }
49 ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => {
50 "session_expired"
51 }
52 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare) => {
53 "cloudflare_challenge"
54 }
55 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel) => {
56 "vercel_challenge"
57 }
58 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile) => {
59 "turnstile_challenge"
60 }
61 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Captcha) => {
62 "captcha_challenge"
63 }
64 ResponseDiagnosticKind::BrowserChallenge(
65 BrowserChallengeKind::LinkedInBotDetection,
66 ) => "linkedin_bot_detection",
67 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::AwsWaf) => {
68 "aws_waf_challenge"
69 }
70 ResponseDiagnosticKind::RateLimited => "rate_limited",
71 }
72 }
73
74 #[must_use]
76 pub fn summary(self) -> String {
77 match self.kind {
78 ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => format!(
79 "Login wall or authenticated content detected (HTTP {}).",
80 self.status
81 ),
82 ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => format!(
83 "Session appears expired or timed out (HTTP {}).",
84 self.status
85 ),
86 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare) => {
87 format!(
88 "Cloudflare browser challenge detected (HTTP {}).",
89 self.status
90 )
91 }
92 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel) => {
93 "Vercel Security Checkpoint detected.".to_string()
94 }
95 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile) => {
96 "Cloudflare Turnstile challenge detected.".to_string()
97 }
98 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Captcha) => {
99 format!("CAPTCHA challenge detected (HTTP {}).", self.status)
100 }
101 ResponseDiagnosticKind::BrowserChallenge(
102 BrowserChallengeKind::LinkedInBotDetection,
103 ) => "LinkedIn bot detection (HTTP 999).".to_string(),
104 ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::AwsWaf) => {
105 format!("AWS WAF challenge detected (HTTP {}).", self.status)
106 }
107 ResponseDiagnosticKind::RateLimited => format!(
108 "Rate limit or throttling response detected (HTTP {}).",
109 self.status
110 ),
111 }
112 }
113
114 #[must_use]
116 pub fn guidance(self) -> &'static str {
117 match self.kind {
118 ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => {
119 "Sign in in a browser first, then retry with the default browser cookies or a named authenticated session. If you explicitly disabled cookies, re-enable them."
120 }
121 ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => {
122 "Refresh the site in a browser to renew the session, then retry with the default browser cookies or a named authenticated session."
123 }
124 ResponseDiagnosticKind::BrowserChallenge(_) => {
125 "Complete the browser challenge in a real browser first, then retry with the default browser cookies or a named session. Use an explicit browser override only if the default profile is not the authenticated one."
126 }
127 ResponseDiagnosticKind::RateLimited => {
128 "Retry later, or use an authenticated browser/session path if the site rate-limits anonymous traffic."
129 }
130 }
131 }
132
133 #[must_use]
135 pub fn message(self) -> String {
136 format!("{}\n{}", self.summary(), self.guidance())
137 }
138}
139
/// Coarse class assigned to a response by [`classify_response`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResponseClass {
    /// HTTP 401, or session-expired markers were found (code `unauthorized`).
    Unauthorized,
    /// Login-wall diagnosis (code `login_required`).
    LoginRequired,
    /// Forbidden / access-denied markers were found (code `forbidden`).
    Forbidden,
    /// Any browser-challenge diagnosis (code `bot_challenge`).
    BotChallenge,
    /// Rate-limit diagnosis (code `rate_limited`).
    RateLimited,
    /// Extracted markdown is dominated by an encoded blob (code `obfuscated_content`).
    ObfuscatedContent,
    /// Extracted markdown is tiny relative to the HTML (code `thin_content`).
    ThinContent,
}
151
152impl ResponseClass {
153 #[must_use]
155 pub fn code(self) -> &'static str {
156 match self {
157 Self::Unauthorized => "unauthorized",
158 Self::LoginRequired => "login_required",
159 Self::Forbidden => "forbidden",
160 Self::BotChallenge => "bot_challenge",
161 Self::RateLimited => "rate_limited",
162 Self::ObfuscatedContent => "obfuscated_content",
163 Self::ThinContent => "thin_content",
164 }
165 }
166}
167
/// One piece of classification evidence: a class, how confident the
/// classifier is, and a fixed explanation.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ResponseSignal {
    /// The class this signal asserts.
    pub class: ResponseClass,
    /// Confidence score assigned by the classifier (the values used in this
    /// file fall between 0.78 and 0.97).
    pub confidence: f32,
    /// Static, human-readable reason the signal was raised.
    pub reason: &'static str,
}
175
/// The ordered set of signals raised for a response, at most one per
/// [`ResponseClass`]. An empty (default) value means nothing was detected.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct ResponseClassification {
    /// Signals in the order they were raised; the first is the primary one.
    signals: Vec<ResponseSignal>,
}
181
182impl ResponseClassification {
183 fn push(&mut self, signal: ResponseSignal) {
184 if !self.has_class(signal.class) {
185 self.signals.push(signal);
186 }
187 }
188
189 #[must_use]
191 pub fn primary(&self) -> Option<&ResponseSignal> {
192 self.signals.first()
193 }
194
195 #[must_use]
197 pub fn has_class(&self, class: ResponseClass) -> bool {
198 self.signals.iter().any(|signal| signal.class == class)
199 }
200}
201
/// Borrowed inputs for [`classify_response`].
///
/// The optional fields gate the content-based checks: thin-content detection
/// needs both `html_bytes` and `markdown_chars`, and obfuscated-content
/// detection needs `markdown`.
#[derive(Debug, Clone, Copy)]
pub struct ResponseAnalysis<'a> {
    /// HTTP status code of the response.
    pub status: u16,
    /// Raw response body; it is lowercased before marker matching.
    pub body: &'a str,
    /// Content-Type value, if known; content checks only run when it contains `html`.
    pub content_type: Option<&'a str>,
    /// Size of the HTML payload, if known.
    pub html_bytes: Option<usize>,
    /// Extracted markdown, if extraction was performed.
    pub markdown: Option<&'a str>,
    /// Length of the extracted markdown, if known.
    pub markdown_chars: Option<usize>,
    /// Extraction quality score, if available; its `confidence` adjusts the
    /// thin-content decision and signal confidence.
    pub quality: Option<&'a QualityScore>,
}
213
214#[must_use]
216pub fn classify_http_response(status: u16, body: &str) -> Option<ResponseDiagnostic> {
217 let body_lower = body.to_lowercase();
218 classify_http_response_lower(status, &body_lower)
219}
220
221#[must_use]
223pub fn classify_response(analysis: ResponseAnalysis<'_>) -> ResponseClassification {
224 let body_lower = analysis.body.to_lowercase();
225 let mut classification = ResponseClassification::default();
226
227 if analysis.status == 401 {
228 classification.push(ResponseSignal {
229 class: ResponseClass::Unauthorized,
230 confidence: 0.97,
231 reason: "http 401 unauthorized response",
232 });
233 } else if let Some(diagnostic) = classify_http_response_lower(analysis.status, &body_lower) {
234 classification.push(map_diagnostic_signal(diagnostic));
235 } else if matches!(analysis.status, 403 | 999) && looks_like_forbidden(&body_lower) {
236 classification.push(ResponseSignal {
237 class: ResponseClass::Forbidden,
238 confidence: if analysis.status == 999 { 0.96 } else { 0.85 },
239 reason: if analysis.status == 999 {
240 "nonstandard anti-automation block status detected"
241 } else {
242 "forbidden or access-denied markers detected"
243 },
244 });
245 }
246
247 if let (Some(html_bytes), Some(markdown_chars)) = (analysis.html_bytes, analysis.markdown_chars)
248 && classify_thin_content(
249 analysis.content_type,
250 html_bytes,
251 markdown_chars,
252 analysis.quality,
253 )
254 .is_some()
255 {
256 let confidence = analysis.quality.map_or(0.78_f32, |quality| {
257 if quality.confidence < 0.5 {
258 0.9_f32
259 } else {
260 0.8_f32
261 }
262 });
263 classification.push(ResponseSignal {
264 class: ResponseClass::ThinContent,
265 confidence,
266 reason: "markdown output is disproportionately small relative to the HTML body",
267 });
268 }
269
270 if let Some(markdown) = analysis.markdown
271 && classify_obfuscated_content(analysis.content_type, markdown).is_some()
272 {
273 classification.push(ResponseSignal {
274 class: ResponseClass::ObfuscatedContent,
275 confidence: 0.95,
276 reason: "extracted content is dominated by a long encoded or obfuscated blob",
277 });
278 }
279
280 classification
281}
282
/// Details returned by [`classify_thin_content`] when the markdown is
/// disproportionately small for the HTML it came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ThinContentDiagnostic {
    /// HTML size that was passed to the classifier.
    pub html_bytes: usize,
    /// Markdown length that was passed to the classifier.
    pub markdown_chars: usize,
    /// `true` when the quality score's confidence was below 0.5, or when a
    /// confidence below 0.35 was itself what triggered the diagnosis.
    pub low_confidence: bool,
}
290
291#[must_use]
294pub fn classify_thin_content(
295 content_type: Option<&str>,
296 html_bytes: usize,
297 markdown_chars: usize,
298 quality: Option<&QualityScore>,
299) -> Option<ThinContentDiagnostic> {
300 let is_html = content_type.is_some_and(|value| value.contains("html"));
301 if !is_html {
302 return None;
303 }
304
305 if is_thin_content(html_bytes, markdown_chars) {
306 return Some(ThinContentDiagnostic {
307 html_bytes,
308 markdown_chars,
309 low_confidence: quality.is_some_and(|score| score.confidence < 0.5),
310 });
311 }
312
313 if html_bytes >= 5_000
314 && markdown_chars < 800
315 && quality.is_some_and(|score| score.confidence < 0.35)
316 {
317 return Some(ThinContentDiagnostic {
318 html_bytes,
319 markdown_chars,
320 low_confidence: true,
321 });
322 }
323
324 None
325}
326
/// Measurements returned by [`classify_obfuscated_content`] when the markdown
/// is dominated by an encoded blob.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ObfuscatedContentDiagnostic {
    /// Length of the longest base64-like token found.
    pub dominant_blob_chars: usize,
    /// Number of non-whitespace characters in the markdown.
    pub non_whitespace_chars: usize,
    /// Number of readable-looking words found; counting stops at 32.
    pub readable_word_count: usize,
}
334
335#[must_use]
339pub fn classify_obfuscated_content(
340 content_type: Option<&str>,
341 markdown: &str,
342) -> Option<ObfuscatedContentDiagnostic> {
343 let is_html = content_type.is_some_and(|value| value.contains("html"));
344 if !is_html {
345 return None;
346 }
347
348 let non_whitespace_chars = markdown.chars().filter(|c| !c.is_whitespace()).count();
349 if non_whitespace_chars < 2_048 {
350 return None;
351 }
352
353 let readable_word_count = markdown
354 .split_whitespace()
355 .filter(|token| looks_like_readable_word(token))
356 .take(32)
357 .count();
358 if readable_word_count >= 24 {
359 return None;
360 }
361
362 let dominant_blob_chars = markdown
363 .split_whitespace()
364 .filter_map(base64ish_blob_token_len)
365 .max()
366 .unwrap_or(0);
367 let dominant_ratio = (dominant_blob_chars * 100) / non_whitespace_chars.max(1);
368
369 if dominant_blob_chars >= 2_048 || (dominant_blob_chars >= 1_024 && dominant_ratio >= 60) {
370 return Some(ObfuscatedContentDiagnostic {
371 dominant_blob_chars,
372 non_whitespace_chars,
373 readable_word_count,
374 });
375 }
376
377 None
378}
379
380fn classify_http_response_lower(status: u16, body_lower: &str) -> Option<ResponseDiagnostic> {
381 if status == 999 {
382 return Some(ResponseDiagnostic {
383 kind: ResponseDiagnosticKind::BrowserChallenge(
384 BrowserChallengeKind::LinkedInBotDetection,
385 ),
386 status,
387 });
388 }
389
390 if looks_like_aws_waf(status, body_lower) {
391 return Some(ResponseDiagnostic {
392 kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::AwsWaf),
393 status,
394 });
395 }
396
397 if looks_like_turnstile(body_lower) {
398 return Some(ResponseDiagnostic {
399 kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile),
400 status,
401 });
402 }
403
404 if status == 429 && looks_like_vercel_checkpoint(body_lower) {
405 return Some(ResponseDiagnostic {
406 kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel),
407 status,
408 });
409 }
410
411 if matches!(status, 403 | 503) && looks_like_cloudflare_challenge(body_lower) {
412 return Some(ResponseDiagnostic {
413 kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare),
414 status,
415 });
416 }
417
418 if matches!(status, 403 | 429 | 503) && looks_like_captcha_interstitial(body_lower) {
419 return Some(ResponseDiagnostic {
420 kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Captcha),
421 status,
422 });
423 }
424
425 if matches!(status, 419 | 440) || looks_like_session_expired(body_lower) {
426 return Some(ResponseDiagnostic {
427 kind: ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired),
428 status,
429 });
430 }
431
432 if status == 429 && looks_like_rate_limit(body_lower) {
433 return Some(ResponseDiagnostic {
434 kind: ResponseDiagnosticKind::RateLimited,
435 status,
436 });
437 }
438
439 if (status == 403
440 && (looks_like_login_wall(body_lower) || looks_like_password_gate(body_lower)))
441 || (looks_like_login_wall(body_lower) && looks_like_password_gate(body_lower))
442 {
443 return Some(ResponseDiagnostic {
444 kind: ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired),
445 status,
446 });
447 }
448
449 None
450}
451
452fn map_diagnostic_signal(diagnostic: ResponseDiagnostic) -> ResponseSignal {
453 match diagnostic.kind {
454 ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => ResponseSignal {
455 class: ResponseClass::LoginRequired,
456 confidence: if diagnostic.status == 200 { 0.83 } else { 0.95 },
457 reason: "login-wall markers and password-gate signals detected",
458 },
459 ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => ResponseSignal {
460 class: ResponseClass::Unauthorized,
461 confidence: 0.94,
462 reason: "session-expired markers detected",
463 },
464 ResponseDiagnosticKind::BrowserChallenge(_) => ResponseSignal {
465 class: ResponseClass::BotChallenge,
466 confidence: 0.97,
467 reason: "browser-challenge or CAPTCHA markers detected",
468 },
469 ResponseDiagnosticKind::RateLimited => ResponseSignal {
470 class: ResponseClass::RateLimited,
471 confidence: 0.91,
472 reason: "rate-limit markers detected",
473 },
474 }
475}
476
477fn looks_like_aws_waf(status: u16, body_lower: &str) -> bool {
478 let status_matches = matches!(status, 202 | 403);
482 if !status_matches && status != 200 {
483 return false;
484 }
485 contains_any(
486 body_lower,
487 &[".awswaf.com", "window.gokuprops", "awswafintegration"],
488 )
489}
490
491fn looks_like_vercel_checkpoint(body_lower: &str) -> bool {
492 contains_any(
493 body_lower,
494 &[
495 "vercel security checkpoint",
496 "we're verifying your browser",
497 "we are verifying your browser",
498 ],
499 )
500}
501
502fn looks_like_cloudflare_challenge(body_lower: &str) -> bool {
503 contains_any(
504 body_lower,
505 &[
506 "cf-browser-verification",
507 "cf-chl-",
508 "cf-challenge",
509 "checking your browser before accessing",
510 "just a moment...",
511 "cloudflare ray id",
512 ],
513 )
514}
515
516fn looks_like_turnstile(body_lower: &str) -> bool {
517 contains_any(
518 body_lower,
519 &["cf-turnstile", "turnstile.js", "challenge-platform"],
520 )
521}
522
523fn looks_like_captcha(body_lower: &str) -> bool {
524 contains_any(
525 body_lower,
526 &["g-recaptcha", "grecaptcha", "h-captcha", "hcaptcha"],
527 ) || (body_lower.contains("captcha") && body_lower.contains("<img"))
528}
529
530fn looks_like_captcha_interstitial(body_lower: &str) -> bool {
531 looks_like_captcha(body_lower)
532 && contains_any(
533 body_lower,
534 &[
535 "verify you are human",
536 "are you human",
537 "security check",
538 "browser verification",
539 "checking your browser",
540 "please enable javascript and cookies to continue",
541 ],
542 )
543}
544
545fn looks_like_rate_limit(body_lower: &str) -> bool {
546 contains_any(
547 body_lower,
548 &[
549 "too many requests",
550 "rate limit",
551 "rate-limit",
552 "throttled",
553 "request limit reached",
554 ],
555 )
556}
557
558fn looks_like_forbidden(body_lower: &str) -> bool {
559 contains_any(
560 body_lower,
561 &[
562 "access denied",
563 "forbidden",
564 "permission denied",
565 "not authorized",
566 "not authorised",
567 ],
568 )
569}
570
571fn looks_like_session_expired(body_lower: &str) -> bool {
572 contains_any(
573 body_lower,
574 &[
575 "session expired",
576 "your session has expired",
577 "session timed out",
578 "please sign in again",
579 "please log in again",
580 ],
581 )
582}
583
584fn looks_like_login_wall(body_lower: &str) -> bool {
585 contains_any(
586 body_lower,
587 &[
588 "login required",
589 "log in to continue",
590 "sign in to continue",
591 "authentication required",
592 "please authenticate",
593 "continue with google",
594 "continue with email",
595 "sign in with",
596 ],
597 )
598}
599
600fn looks_like_password_gate(body_lower: &str) -> bool {
601 contains_any(
602 body_lower,
603 &[
604 "type=\"password\"",
605 "autocomplete=\"current-password\"",
606 "name=\"password\"",
607 "id=\"password\"",
608 "enter your password",
609 "forgot password",
610 ],
611 )
612}
613
/// Returns `true` if `haystack` contains at least one of `needles` as a
/// substring; an empty `needles` slice yields `false`.
fn contains_any(haystack: &str, needles: &[&str]) -> bool {
    for needle in needles {
        if haystack.contains(*needle) {
            return true;
        }
    }
    false
}
617
/// Heuristic for "this token is an ordinary word".
///
/// A token qualifies when it is 4 to 24 characters long and at least 80% of
/// those characters (integer percentage) are ASCII letters.
fn looks_like_readable_word(token: &str) -> bool {
    let mut total = 0_usize;
    let mut letters = 0_usize;
    for ch in token.chars() {
        total += 1;
        if ch.is_ascii_alphabetic() {
            letters += 1;
        }
    }

    if !matches!(total, 4..=24) {
        return false;
    }

    // `total` is at least 4 here, so the division cannot fault.
    letters * 100 / total >= 80
}
627
/// Returns the byte length of `token` if it looks like a long base64-style blob.
///
/// The token must be at least 768 bytes, at least 98% of its bytes (integer
/// percentage) must be ASCII alphanumerics or one of `+ / = _ -`, and it must
/// contain both a digit and a letter. Otherwise `None` is returned.
fn base64ish_blob_token_len(token: &str) -> Option<usize> {
    let total = token.len();
    if total < 768 {
        return None;
    }

    // Gather all three tallies in a single pass over the bytes.
    let mut permitted = 0_usize;
    let mut digits = 0_usize;
    let mut letters = 0_usize;
    for byte in token.bytes() {
        if byte.is_ascii_digit() {
            digits += 1;
            permitted += 1;
        } else if byte.is_ascii_alphabetic() {
            letters += 1;
            permitted += 1;
        } else if matches!(byte, b'+' | b'/' | b'=' | b'_' | b'-') {
            permitted += 1;
        }
    }

    let mostly_permitted = permitted * 100 / total >= 98;
    if mostly_permitted && digits > 0 && letters > 0 {
        Some(total)
    } else {
        None
    }
}
652
/// Size-only thin-content rule.
///
/// Returns `true` when the HTML is at least 5,000 long, the markdown is
/// shorter than 800, and the markdown is under 2% of the HTML (integer
/// percentage).
fn is_thin_content(html_len: usize, markdown_len: usize) -> bool {
    const HTML_FLOOR: usize = 5_000;
    const MARKDOWN_CEILING: usize = 800;
    const RATIO_LIMIT_PERCENT: usize = 2;

    // The first clause guarantees `html_len` is non-zero before dividing.
    html_len >= HTML_FLOOR
        && markdown_len < MARKDOWN_CEILING
        && markdown_len * 100 / html_len < RATIO_LIMIT_PERCENT
}
665
/// Unit tests for the HTTP-level diagnostics and the combined classification.
#[cfg(test)]
mod tests {
    use super::{
        AuthRequiredKind, BrowserChallengeKind, ResponseAnalysis, ResponseClass,
        ResponseDiagnosticKind, classify_http_response, classify_obfuscated_content,
        classify_response,
    };

    // A 429 carrying the Vercel checkpoint text is a challenge, not a rate limit.
    #[test]
    fn classify_http_response_detects_vercel_checkpoint() {
        let body = "<html><body>Vercel Security Checkpoint</body></html>";
        let diagnostic = classify_http_response(429, body).expect("vercel classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel)
        );
        assert_eq!(diagnostic.code(), "vercel_challenge");
    }

    #[test]
    fn classify_http_response_detects_cloudflare_challenge() {
        let body = "<div id='cf-browser-verification'>Please wait...</div>";
        let diagnostic = classify_http_response(403, body).expect("cloudflare classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare)
        );
    }

    // Turnstile markers are matched regardless of status, including 200.
    #[test]
    fn classify_http_response_detects_turnstile_challenge_on_200() {
        let body = "<div class='cf-turnstile'></div><script src='turnstile.js'></script>";
        let diagnostic = classify_http_response(200, body).expect("turnstile classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile)
        );
    }

    // On a non-403 status, both login wording and a password field are required.
    #[test]
    fn classify_http_response_detects_login_wall_with_password_form() {
        let body = r#"
            <html><body>
            <h1>Sign in to continue</h1>
            <form><input type="password" name="password"></form>
            </body></html>
        "#;
        let diagnostic = classify_http_response(200, body).expect("login wall classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired)
        );
        assert_eq!(diagnostic.code(), "login_required");
    }

    #[test]
    fn classify_http_response_detects_session_expired() {
        let body = "<html><body>Your session has expired. Please sign in again.</body></html>";
        let diagnostic = classify_http_response(200, body).expect("session expired classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired)
        );
    }

    #[test]
    fn classify_http_response_detects_rate_limit() {
        let body = "Too many requests. Rate limit reached.";
        let diagnostic = classify_http_response(429, body).expect("rate limit classification");
        assert_eq!(diagnostic.kind, ResponseDiagnosticKind::RateLimited);
    }

    #[test]
    fn classify_http_response_ignores_normal_html() {
        let body = "<html><body><article><h1>Hello</h1><p>World</p></article></body></html>";
        assert!(
            classify_http_response(200, body).is_none(),
            "expected no diagnostic for regular article HTML"
        );
    }

    // 120 markdown chars against 20,000 HTML bytes is well under the 2% ratio.
    #[test]
    fn classify_response_marks_thin_html_content() {
        let classification = classify_response(ResponseAnalysis {
            status: 200,
            body: "<html></html>",
            content_type: Some("text/html"),
            html_bytes: Some(20_000),
            markdown: Some("short"),
            markdown_chars: Some(120),
            quality: None,
        });
        assert!(classification.has_class(ResponseClass::ThinContent));
    }

    #[test]
    fn classify_response_maps_session_expired_to_unauthorized() {
        let classification = classify_response(ResponseAnalysis {
            status: 200,
            body: "<html><body>Your session has expired. Please sign in again.</body></html>",
            content_type: Some("text/html"),
            html_bytes: None,
            markdown: None,
            markdown_chars: None,
            quality: None,
        });
        assert_eq!(
            classification.primary().map(|signal| signal.class),
            Some(ResponseClass::Unauthorized)
        );
    }

    #[test]
    fn classify_response_maps_http_401_to_unauthorized() {
        let classification = classify_response(ResponseAnalysis {
            status: 401,
            body: "<html><body>Unauthorized</body></html>",
            content_type: Some("text/html"),
            html_bytes: None,
            markdown: None,
            markdown_chars: None,
            quality: None,
        });
        assert_eq!(
            classification.primary().map(|signal| signal.class),
            Some(ResponseClass::ObfuscatedContent).and(Some(ResponseClass::Unauthorized))
        );
    }

    // The markdown is a short title followed by one 5,600-byte base64-like token.
    #[test]
    fn classify_response_detects_obfuscated_content_blob() {
        let blob = format!("Title: Protected article\n\n{}", "AbC123+/".repeat(700));
        let classification = classify_response(ResponseAnalysis {
            status: 200,
            body: "<html><body><script>protected payload</script></body></html>",
            content_type: Some("text/html"),
            html_bytes: Some(40_000),
            markdown: Some(&blob),
            markdown_chars: Some(blob.len()),
            quality: None,
        });
        assert_eq!(
            classification.primary().map(|signal| signal.class),
            Some(ResponseClass::ObfuscatedContent)
        );
        assert!(
            classify_obfuscated_content(Some("text/html"), &blob).is_some(),
            "expected blob classification for encoded markdown"
        );
    }

    // Readable prose with a single pasted token must not be flagged.
    #[test]
    fn classify_obfuscated_content_ignores_readable_article_with_one_blob() {
        let article = [
            "This article explains a benchmark result in normal prose.",
            "It includes enough readable words to look like a real article body.",
            "A single pasted token should not dominate the classification.",
            &"AbC123+/".repeat(180),
        ]
        .join(" ");
        assert!(
            classify_obfuscated_content(Some("text/html"), &article).is_none(),
            "expected readable article to avoid obfuscated classification"
        );
    }

    #[test]
    fn classify_http_response_detects_aws_waf_challenge() {
        let body = r#"<script src="https://abc.awswaf.com/xyz/challenge.js"></script>"#;
        let diagnostic = classify_http_response(202, body).expect("aws waf classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::AwsWaf)
        );
        assert_eq!(diagnostic.code(), "aws_waf_challenge");
    }

    // A reCAPTCHA widget on a 200 login page is not a CAPTCHA interstitial:
    // the CAPTCHA check only runs for 403/429/503.
    #[test]
    fn classify_http_response_avoids_login_page_recaptcha_false_positive() {
        let body = r#"
            <html><body>
            <h1>Sign in to continue</h1>
            <form><input type="password" name="password"></form>
            <div class="g-recaptcha"></div>
            </body></html>
        "#;
        let diagnostic = classify_http_response(200, body).expect("login wall classification");
        assert_eq!(
            diagnostic.kind,
            ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired)
        );
    }
}