Skip to main content

imp_core/tools/web/
read.rs

1//! Native page reading — fetch HTML via reqwest + extract with readability.
2//!
3//! No external APIs needed for reading pages. Handles most static and
4//! server-rendered pages. Won't work for heavy SPAs that require JS execution.
5
6use reqwest::Client;
7use url::Url;
8
9use super::types::{ContentFormat, ExtractionQuality, PageContent};
10
/// Browser-like `User-Agent` value sent with page fetches, so that sites
/// which refuse unknown clients still answer the request.
pub(crate) const USER_AGENT: &str = concat!(
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
);

/// `Accept` header value: markdown is preferred, then plain text, then HTML,
/// with a low-priority wildcard as the last resort.
pub(crate) const ACCEPT_HEADER: &str =
    "text/markdown,text/plain;q=0.9,text/html;q=0.8,application/xhtml+xml;q=0.7,*/*;q=0.5";

/// Largest response body, in bytes, that the reader accepts (5 MiB).
const MAX_RESPONSE_BYTES: u64 = 5 * 1024 * 1024;
17
18/// Fetch a URL and extract its readable content.
19pub async fn fetch_and_extract(client: &Client, url: &str) -> Result<PageContent, ReadError> {
20    let parsed_url = validate_url(url)?;
21
22    if super::youtube::is_youtube_url(&parsed_url) {
23        return super::youtube::fetch_and_extract(client, url)
24            .await
25            .map_err(|err| ReadError::Youtube(err.to_string()));
26    }
27
28    let requested_url = url.to_string();
29
30    let response = client
31        .get(url)
32        .header("User-Agent", USER_AGENT)
33        .header("Accept", ACCEPT_HEADER)
34        .header("Accept-Language", "en-US,en;q=0.9")
35        .send()
36        .await
37        .map_err(|e| ReadError::Fetch(e.to_string()))?;
38
39    let status_code = response.status().as_u16();
40    if !response.status().is_success() {
41        return Err(ReadError::HttpStatus(
42            status_code,
43            response
44                .status()
45                .canonical_reason()
46                .unwrap_or("Unknown")
47                .to_string(),
48        ));
49    }
50
51    let content_type = response
52        .headers()
53        .get("content-type")
54        .and_then(|v| v.to_str().ok())
55        .unwrap_or("")
56        .to_string();
57
58    let format_received = detect_content_format(&content_type);
59
60    // Reject binary content types (images, video, audio, etc.)
61    let is_text = content_type.is_empty()
62        || content_type.contains("text/")
63        || content_type.contains("application/json")
64        || content_type.contains("application/xml")
65        || content_type.contains("application/xhtml")
66        || content_type.contains("application/javascript")
67        || content_type.contains("+xml")
68        || content_type.contains("+json");
69    if !is_text {
70        return Err(ReadError::NotHtml(content_type));
71    }
72
73    let final_url = response.url().to_string();
74    validate_url(&final_url)?;
75    let was_redirected = final_url != requested_url;
76    if let Some(content_length) = response.content_length() {
77        if content_length > MAX_RESPONSE_BYTES {
78            return Err(ReadError::ResponseTooLarge(content_length));
79        }
80    }
81    let bytes = response
82        .bytes()
83        .await
84        .map_err(|e| ReadError::Fetch(e.to_string()))?;
85    if bytes.len() as u64 > MAX_RESPONSE_BYTES {
86        return Err(ReadError::ResponseTooLarge(bytes.len() as u64));
87    }
88    let raw_body_bytes = bytes.len();
89    let html = String::from_utf8_lossy(&bytes).into_owned();
90
91    if html.len() < 100 {
92        return Err(ReadError::InsufficientContent);
93    }
94
95    // Shared metadata for all paths
96    let meta = ResponseMeta {
97        requested_url,
98        status_code,
99        content_type: if content_type.is_empty() {
100            None
101        } else {
102            Some(content_type.clone())
103        },
104        format_received,
105        was_redirected,
106        raw_body_bytes,
107    };
108
109    match format_received {
110        ContentFormat::Markdown | ContentFormat::PlainText => {
111            let cleaned = clean_text(&html);
112            let mut page = PageContent {
113                title: None,
114                content_length: cleaned.len(),
115                text: cleaned,
116                url: final_url,
117                requested_url: meta.requested_url,
118                status_code: meta.status_code,
119                content_type: meta.content_type,
120                format_received: meta.format_received,
121                was_redirected: meta.was_redirected,
122                raw_body_bytes: meta.raw_body_bytes,
123                diagnostics: Vec::new(),
124                quality: ExtractionQuality::Good,
125                quality_reasons: Vec::new(),
126            };
127            page.diagnostics = diagnose(&page, "");
128            apply_quality(&mut page);
129            Ok(page)
130        }
131        ContentFormat::Html => {
132            let mut page = extract_readable(&html, &final_url)?;
133            page.requested_url = meta.requested_url;
134            page.status_code = meta.status_code;
135            page.content_type = meta.content_type;
136            page.format_received = meta.format_received;
137            page.was_redirected = meta.was_redirected;
138            page.raw_body_bytes = meta.raw_body_bytes;
139            page.diagnostics = diagnose(&page, &html);
140            apply_quality(&mut page);
141            Ok(page)
142        }
143    }
144}
145
146/// Metadata captured from the HTTP response before extraction.
147struct ResponseMeta {
148    requested_url: String,
149    status_code: u16,
150    content_type: Option<String>,
151    format_received: ContentFormat,
152    was_redirected: bool,
153    raw_body_bytes: usize,
154}
155
156/// Extract readable content from raw HTML using Mozilla Readability algorithm.
157fn extract_readable(html: &str, url: &str) -> Result<PageContent, ReadError> {
158    use readability_rust::Readability;
159
160    let mut parser = Readability::new_with_base_uri(html, url, None)
161        .map_err(|e| ReadError::Parse(format!("{e}")))?;
162
163    let article = parser.parse().ok_or(ReadError::NoContent)?;
164
165    let title = article.title.clone();
166
167    // article.text_content is the cleaned plain text
168    // article.content is HTML — we convert to plain text ourselves for safety
169    let text = article
170        .text_content
171        .as_deref()
172        .or(article.content.as_deref())
173        .unwrap_or("")
174        .to_string();
175
176    if text.len() < 50 {
177        return Err(ReadError::InsufficientContent);
178    }
179
180    Ok(PageContent {
181        content_length: text.len(),
182        title,
183        text: clean_text(&text),
184        url: url.to_string(),
185        // Populated by caller (fetch_and_extract) after extraction
186        requested_url: url.to_string(),
187        status_code: 200,
188        content_type: None,
189        format_received: ContentFormat::Html,
190        was_redirected: false,
191        raw_body_bytes: 0,
192        diagnostics: Vec::new(),
193        quality: ExtractionQuality::Good,
194        quality_reasons: Vec::new(),
195    })
196}
197
198fn validate_url(url: &str) -> Result<Url, ReadError> {
199    let parsed = Url::parse(url).map_err(|e| ReadError::InvalidUrl(e.to_string()))?;
200    match parsed.scheme() {
201        "http" | "https" => {}
202        scheme => {
203            return Err(ReadError::UnsafeUrl(format!(
204                "unsupported URL scheme: {scheme}"
205            )));
206        }
207    }
208
209    let Some(host) = parsed.host_str() else {
210        return Err(ReadError::UnsafeUrl("missing URL host".to_string()));
211    };
212    let host = host.trim_end_matches('.').to_ascii_lowercase();
213    if matches!(host.as_str(), "localhost" | "metadata.google.internal") {
214        return Err(ReadError::UnsafeUrl(format!("blocked host: {host}")));
215    }
216    if host.ends_with(".localhost") || host.ends_with(".local") {
217        return Err(ReadError::UnsafeUrl(format!("blocked local host: {host}")));
218    }
219    if let Ok(ip) = host.parse::<std::net::IpAddr>() {
220        if is_blocked_ip(ip) {
221            return Err(ReadError::UnsafeUrl(format!(
222                "blocked private address: {ip}"
223            )));
224        }
225    } else if let Some(ip) = parsed.host().and_then(|host| match host {
226        url::Host::Ipv4(ip) => Some(std::net::IpAddr::V4(ip)),
227        url::Host::Ipv6(ip) => Some(std::net::IpAddr::V6(ip)),
228        url::Host::Domain(_) => None,
229    }) {
230        if is_blocked_ip(ip) {
231            return Err(ReadError::UnsafeUrl(format!(
232                "blocked private address: {ip}"
233            )));
234        }
235    }
236
237    Ok(parsed)
238}
239
/// Returns `true` when `ip` is an address the page reader must never contact.
///
/// IPv4: private, loopback, link-local (which includes the
/// `169.254.169.254` cloud metadata endpoint), broadcast, documentation,
/// unspecified, the whole `0.0.0.0/8` block and everything from `224.0.0.0`
/// upwards (multicast and reserved).
///
/// IPv6: loopback, unspecified, multicast, unique-local, unicast link-local
/// and documentation addresses. IPv4-mapped addresses (`::ffff:a.b.c.d`) are
/// judged by the IPv4 rules of the embedded address, so a literal such as
/// `::ffff:127.0.0.1` cannot be used to slip past the IPv4 checks.
fn is_blocked_ip(ip: std::net::IpAddr) -> bool {
    match ip {
        std::net::IpAddr::V4(v4) => is_blocked_ipv4(v4),
        std::net::IpAddr::V6(v6) => match v6.to_ipv4_mapped() {
            Some(embedded) => is_blocked_ipv4(embedded),
            None => {
                v6.is_loopback()
                    || v6.is_unspecified()
                    || v6.is_multicast()
                    || v6.is_unique_local()
                    || v6.is_unicast_link_local()
                    || is_documentation_ipv6(v6)
            }
        },
    }
}

/// IPv4 half of `is_blocked_ip`; also applied to IPv4-mapped IPv6 addresses.
fn is_blocked_ipv4(ip: std::net::Ipv4Addr) -> bool {
    let first_octet = ip.octets()[0];
    ip.is_private()
        || ip.is_loopback()
        // 169.254.0.0/16, which contains the 169.254.169.254 metadata address.
        || ip.is_link_local()
        || ip.is_broadcast()
        || is_documentation_ipv4(ip)
        || ip.is_unspecified()
        // "This network" (0.0.0.0/8).
        || first_octet == 0
        // Multicast (224/4) and reserved (240/4) space.
        || first_octet >= 224
}

/// `true` for the IPv4 documentation ranges 192.0.2.0/24, 198.51.100.0/24
/// and 203.0.113.0/24.
fn is_documentation_ipv4(ip: std::net::Ipv4Addr) -> bool {
    ip.is_documentation()
}

/// `true` for the IPv6 documentation prefix 2001:db8::/32.
fn is_documentation_ipv6(ip: std::net::Ipv6Addr) -> bool {
    matches!(ip.segments(), [0x2001, 0x0db8, ..])
}
273
274fn apply_quality(page: &mut PageContent) {
275    let mut reasons = Vec::new();
276    if page.content_length < 300 {
277        reasons.push("short_content".to_string());
278    }
279    if !page.diagnostics.is_empty() {
280        reasons.push("diagnostics".to_string());
281    }
282    if page.raw_body_bytes > 100 * 1024
283        && (page.content_length as f64) < (page.raw_body_bytes as f64 * 0.1)
284    {
285        reasons.push("low_extraction_ratio".to_string());
286    }
287
288    page.quality = if reasons
289        .iter()
290        .any(|reason| reason == "low_extraction_ratio")
291        || reasons.len() >= 2
292    {
293        ExtractionQuality::Poor
294    } else if reasons.is_empty() {
295        ExtractionQuality::Good
296    } else {
297        ExtractionQuality::Partial
298    };
299    page.quality_reasons = reasons;
300}
301
302pub fn diagnose(page: &PageContent, raw_html: &str) -> Vec<String> {
303    let mut warnings = Vec::new();
304    let text_lower = page.text.to_lowercase();
305    let html_lower = raw_html.to_lowercase();
306
307    let short_text = page.content_length < 500;
308    let has_loading_indicator = ["loading...", "loading documentation"]
309        .iter()
310        .any(|needle| text_lower.contains(needle));
311    let has_noscript = html_lower.contains("<noscript");
312    let nav_link_count = html_lower.matches("<nav").count()
313        + html_lower.matches("<a ").count()
314        + html_lower.matches("<a>").count();
315    let has_nav_shell_pattern = short_text && nav_link_count >= 8;
316    if short_text && (has_loading_indicator || has_noscript || has_nav_shell_pattern) {
317        warnings.push(
318            "Page appears to be a client-rendered shell. Content may require JavaScript."
319                .to_string(),
320        );
321    }
322
323    let very_short_text = page.content_length < 300;
324    let has_soft_404_indicator = [
325        "page not found",
326        "can't find that page",
327        "404",
328        "doesn't exist",
329        "has been moved",
330    ]
331    .iter()
332    .any(|needle| text_lower.contains(needle));
333    if page.status_code == 200 && very_short_text && has_soft_404_indicator {
334        warnings
335            .push("Page appears to be a soft 404 (HTTP 200 but error page content).".to_string());
336    }
337
338    if page.raw_body_bytes > 20 * 1024 && page.content_length < 2 * 1024 {
339        warnings.push(format!(
340            "Large page ({} bytes) but only {} chars extracted. Content may be incomplete.",
341            page.raw_body_bytes, page.content_length
342        ));
343    }
344
345    if page.raw_body_bytes > 100 * 1024
346        && (page.content_length as f64) < (page.raw_body_bytes as f64 * 0.1)
347    {
348        let pct = ((page.content_length as f64 / page.raw_body_bytes as f64) * 100.0).round();
349        warnings.push(format!(
350            "Significant content may have been lost during extraction ({}% of response retained).",
351            pct as usize
352        ));
353    }
354
355    warnings
356}
357
/// Normalise whitespace in extracted text.
///
/// Every line is trimmed, runs of blank lines are capped at two, and leading
/// and trailing whitespace of the whole result is removed. Line endings are
/// emitted as `\n`.
fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    let mut blank_run = 0u32;

    for line in text.lines().map(str::trim) {
        if !line.is_empty() {
            blank_run = 0;
            cleaned.push_str(line);
        } else {
            blank_run += 1;
            if blank_run > 2 {
                // Third or later blank line in a row: drop it entirely.
                continue;
            }
        }
        cleaned.push('\n');
    }

    cleaned.trim().to_string()
}
379
380fn detect_content_format(content_type: &str) -> ContentFormat {
381    let content_type = content_type.to_ascii_lowercase();
382
383    if content_type.contains("text/markdown") || content_type.contains("text/x-markdown") {
384        ContentFormat::Markdown
385    } else if content_type.contains("text/html") || content_type.contains("application/xhtml+xml") {
386        ContentFormat::Html
387    } else {
388        ContentFormat::PlainText
389    }
390}
391
/// Everything that can go wrong while fetching and extracting a page.
#[derive(Debug)]
pub enum ReadError {
    /// The URL could not be parsed; carries the parser's message.
    InvalidUrl(String),
    /// The URL parsed but points at a disallowed scheme, host or address.
    UnsafeUrl(String),
    /// Sending the request or downloading the body failed.
    Fetch(String),
    /// The server answered with a non-success status (code, reason phrase).
    HttpStatus(u16, String),
    /// The response's content type is not textual; carries the header value.
    NotHtml(String),
    /// The HTML parser could not be set up; carries its message.
    Parse(String),
    /// The readability pass found no article in the page.
    NoContent,
    /// The body or the extracted text is too short to be useful.
    InsufficientContent,
    /// The body exceeds the size limit; carries the offending byte count.
    ResponseTooLarge(u64),
    /// The YouTube-specific extractor failed; carries its message.
    Youtube(String),
}
405
406impl std::fmt::Display for ReadError {
407    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
408        match self {
409            Self::InvalidUrl(msg) => write!(f, "Invalid URL: {msg}"),
410            Self::UnsafeUrl(msg) => write!(f, "Unsafe URL: {msg}"),
411            Self::Fetch(msg) => write!(f, "Fetch failed: {msg}"),
412            Self::HttpStatus(code, reason) => write!(f, "HTTP {code} {reason}"),
413            Self::NotHtml(ct) => write!(f, "Not an HTML page (content-type: {ct})"),
414            Self::Parse(msg) => write!(f, "Parse error: {msg}"),
415            Self::NoContent => write!(f, "Could not extract readable content from page"),
416            Self::InsufficientContent => write!(f, "Page returned insufficient content"),
417            Self::ResponseTooLarge(bytes) => write!(
418                f,
419                "Response too large: {bytes} bytes exceeds {} byte limit",
420                MAX_RESPONSE_BYTES
421            ),
422            Self::Youtube(msg) => write!(f, "YouTube extraction failed: {msg}"),
423        }
424    }
425}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an HTML `PageContent` fixture: status 200, not redirected, no
    /// diagnostics, `content_length` equal to the byte length of `text`.
    fn html_page(title: &str, url: &str, text: &str, raw_body_bytes: usize) -> PageContent {
        PageContent {
            title: Some(title.to_string()),
            text: text.to_string(),
            url: url.to_string(),
            content_length: text.len(),
            requested_url: url.to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            format_received: ContentFormat::Html,
            was_redirected: false,
            raw_body_bytes,
            diagnostics: Vec::new(),
            quality: ExtractionQuality::Good,
            quality_reasons: Vec::new(),
        }
    }

    /// `true` when any warning contains `fragment`.
    fn any_warning_mentions(warnings: &[String], fragment: &str) -> bool {
        warnings.iter().any(|warning| warning.contains(fragment))
    }

    #[test]
    fn accept_header_prefers_markdown() {
        assert_eq!(
            ACCEPT_HEADER,
            "text/markdown,text/plain;q=0.9,text/html;q=0.8,application/xhtml+xml;q=0.7,*/*;q=0.5"
        );
    }

    #[test]
    fn validate_url_rejects_unsafe_targets() {
        let unsafe_urls = [
            "file:///etc/passwd",
            "http://localhost:3000",
            "https://service.local/path",
            "http://127.0.0.1",
            "http://10.0.0.1",
            "http://169.254.169.254/latest/meta-data",
            "http://[::1]/",
        ];

        for url in unsafe_urls {
            let result = validate_url(url);
            assert!(
                matches!(result, Err(ReadError::UnsafeUrl(_))),
                "expected unsafe URL error for {url}, got {result:?}"
            );
        }
    }

    #[test]
    fn validate_url_allows_public_http_urls() {
        assert!(validate_url("https://example.com/path").is_ok());
        assert!(validate_url("http://93.184.216.34/").is_ok());
    }

    #[test]
    fn quality_marks_low_extraction_ratio_as_poor() {
        let mut page = html_page("Big Page", "https://example.com/big", "short", 150_000);
        page.diagnostics = vec!["warning".to_string()];

        apply_quality(&mut page);

        assert_eq!(page.quality.name(), "poor");
        assert!(page
            .quality_reasons
            .iter()
            .any(|reason| reason == "low_extraction_ratio"));
    }

    #[test]
    fn detect_content_format_treats_markdown_as_markdown() {
        assert_eq!(
            detect_content_format("text/markdown; charset=utf-8"),
            ContentFormat::Markdown
        );
    }

    #[test]
    fn detect_content_format_treats_plain_text_as_plain_text() {
        for header in ["text/plain; charset=utf-8", "application/json"] {
            assert_eq!(detect_content_format(header), ContentFormat::PlainText);
        }
    }

    #[test]
    fn markdown_and_plain_text_skip_readability_cleaning_path() {
        assert_eq!(clean_text("# Title\n\n\nParagraph"), "# Title\n\n\nParagraph");
        assert_eq!(
            detect_content_format("text/markdown"),
            ContentFormat::Markdown
        );

        assert_eq!(clean_text("  hello  \n\n\nworld  "), "hello\n\n\nworld");
        assert_eq!(
            detect_content_format("text/plain"),
            ContentFormat::PlainText
        );
    }

    #[test]
    fn clean_text_collapses_blank_lines() {
        let cleaned = clean_text("Hello\n\n\n\n\nWorld\n\nFoo");
        // Allows up to 2 blank lines (3 newlines total), then collapses
        assert!(cleaned.starts_with("Hello\n"));
        assert!(cleaned.contains("World"));
        assert!(!cleaned.contains("\n\n\n\n"));
    }

    #[test]
    fn clean_text_trims_lines() {
        assert_eq!(clean_text("  hello  \n  world  "), "hello\nworld");
    }

    #[test]
    fn extract_readable_from_html() {
        let html = r#"
        <html>
        <head><title>Test Article</title></head>
        <body>
            <nav>Skip this navigation</nav>
            <article>
                <h1>Test Article Title</h1>
                <p>This is the main content of the article. It has enough text to be
                considered readable content by the readability algorithm. We need to make
                sure there is sufficient content here for the extraction to work properly.
                The readability algorithm looks for substantial blocks of text content.</p>
                <p>Here is another paragraph with more substantial content to ensure that
                the extraction algorithm has enough material to work with. This paragraph
                adds additional context and information that would be typical in a real
                web article about some topic.</p>
            </article>
            <footer>Copyright 2024</footer>
        </body>
        </html>"#;

        let page = match extract_readable(html, "https://example.com/test") {
            Ok(page) => page,
            // Readability may not extract from minimal HTML — that's acceptable
            Err(ReadError::InsufficientContent) | Err(ReadError::NoContent) => return,
            Err(e) => panic!("Unexpected error: {e}"),
        };

        assert!(page.text.contains("main content"));
        assert!(!page.text.contains("Skip this navigation"));
        assert_eq!(page.url, "https://example.com/test");
        assert_eq!(page.requested_url, "https://example.com/test");
        assert_eq!(page.status_code, 200);
        assert!(!page.was_redirected);
        assert_eq!(page.raw_body_bytes, 0);
        assert!(page.content_type.is_none());
        assert!(page.diagnostics.is_empty());
    }

    #[test]
    fn response_metadata_can_be_applied_after_extraction() {
        let html = r#"
        <html>
        <head><title>Redirected Article</title></head>
        <body>
            <article>
                <p>This article has enough body text to survive readability extraction and
                prove that metadata can be preserved when the requested URL differs from
                the final URL after redirects.</p>
                <p>Additional text keeps the extractor happy and representative of a real page.</p>
            </article>
        </body>
        </html>"#;

        let mut page = extract_readable(html, "https://example.com/final").unwrap();
        page.requested_url = "https://example.com/start".to_string();
        page.status_code = 200;
        page.content_type = Some("text/html; charset=utf-8".to_string());
        page.format_received = ContentFormat::Html;
        page.was_redirected = true;
        page.raw_body_bytes = html.len();

        assert_eq!(page.url, "https://example.com/final");
        assert_eq!(page.requested_url, "https://example.com/start");
        assert_eq!(page.status_code, 200);
        assert_eq!(
            page.content_type.as_deref(),
            Some("text/html; charset=utf-8")
        );
        assert!(page.was_redirected);
        assert_eq!(page.raw_body_bytes, html.len());
    }

    #[test]
    fn diagnose_spa_shell_from_loading_text() {
        let page = html_page(
            "Docs",
            "https://example.com/docs",
            "Loading documentation...",
            2_000,
        );

        let warnings = diagnose(
            &page,
            "<html><body><noscript>Enable JS</noscript></body></html>",
        );
        assert!(any_warning_mentions(&warnings, "client-rendered shell"));
    }

    #[test]
    fn diagnose_soft_404_with_http_200() {
        let page = html_page(
            "Missing",
            "https://example.com/missing",
            "Page not found. The page has been moved.",
            1_500,
        );

        let warnings = diagnose(&page, "<html><body>404</body></html>");
        assert!(any_warning_mentions(&warnings, "soft 404"));
    }

    #[test]
    fn diagnose_does_not_flag_normal_page() {
        let text = "This is a normal documentation page with enough content to explain installation, configuration, and usage in detail. It includes several paragraphs of useful information for readers and should not be treated as a shell or error page. Extra explanation here keeps it comfortably above the short-content heuristics and avoids false positives.";
        let page = html_page("Guide", "https://example.com/guide", text, 8_000);

        let warnings = diagnose(
            &page,
            "<html><body><article>real docs</article></body></html>",
        );
        assert!(warnings.is_empty());
    }

    #[test]
    fn diagnose_low_extraction_ratio_warning() {
        let page = html_page(
            "Big Page",
            "https://example.com/big",
            "A short extracted summary.",
            150_000,
        );

        let warnings = diagnose(&page, "<html></html>");
        assert!(any_warning_mentions(&warnings, "Large page"));
        assert!(any_warning_mentions(
            &warnings,
            "Significant content may have been lost"
        ));
    }
}