Skip to main content

imp_core/tools/web/
read.rs

1//! Native page reading — fetch HTML via reqwest + extract with readability.
2//!
3//! No external APIs needed for reading pages. Handles most static and
4//! server-rendered pages. Won't work for heavy SPAs that require JS execution.
5
6use reqwest::Client;
7use url::Url;
8
9use super::types::{ContentFormat, PageContent};
10
/// Browser-like `User-Agent` sent with every page fetch so that servers which
/// block unknown clients still answer.
pub(crate) const USER_AGENT: &str = concat!(
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/131.0.0.0 Safari/537.36",
);

/// `Accept` header sent with every page fetch. Media types are listed in
/// descending preference: markdown first (implicit `q=1`), then plain text,
/// HTML, XHTML, and finally anything else.
pub(crate) const ACCEPT_HEADER: &str = concat!(
    "text/markdown,",
    "text/plain;q=0.9,",
    "text/html;q=0.8,",
    "application/xhtml+xml;q=0.7,",
    "*/*;q=0.5",
);
16
17/// Fetch a URL and extract its readable content.
18pub async fn fetch_and_extract(client: &Client, url: &str) -> Result<PageContent, ReadError> {
19    let parsed_url = Url::parse(url).map_err(|e| ReadError::InvalidUrl(e.to_string()))?;
20
21    if super::youtube::is_youtube_url(&parsed_url) {
22        return super::youtube::fetch_and_extract(client, url)
23            .await
24            .map_err(|err| ReadError::Youtube(err.to_string()));
25    }
26
27    let requested_url = url.to_string();
28
29    let response = client
30        .get(url)
31        .header("User-Agent", USER_AGENT)
32        .header("Accept", ACCEPT_HEADER)
33        .header("Accept-Language", "en-US,en;q=0.9")
34        .send()
35        .await
36        .map_err(|e| ReadError::Fetch(e.to_string()))?;
37
38    let status_code = response.status().as_u16();
39    if !response.status().is_success() {
40        return Err(ReadError::HttpStatus(
41            status_code,
42            response
43                .status()
44                .canonical_reason()
45                .unwrap_or("Unknown")
46                .to_string(),
47        ));
48    }
49
50    let content_type = response
51        .headers()
52        .get("content-type")
53        .and_then(|v| v.to_str().ok())
54        .unwrap_or("")
55        .to_string();
56
57    let format_received = detect_content_format(&content_type);
58
59    // Reject binary content types (images, video, audio, etc.)
60    let is_text = content_type.is_empty()
61        || content_type.contains("text/")
62        || content_type.contains("application/json")
63        || content_type.contains("application/xml")
64        || content_type.contains("application/xhtml")
65        || content_type.contains("application/javascript")
66        || content_type.contains("+xml")
67        || content_type.contains("+json");
68    if !is_text {
69        return Err(ReadError::NotHtml(content_type));
70    }
71
72    let final_url = response.url().to_string();
73    let was_redirected = final_url != requested_url;
74    let html = response
75        .text()
76        .await
77        .map_err(|e| ReadError::Fetch(e.to_string()))?;
78    let raw_body_bytes = html.len();
79
80    if html.len() < 100 {
81        return Err(ReadError::InsufficientContent);
82    }
83
84    // Shared metadata for all paths
85    let meta = ResponseMeta {
86        requested_url,
87        status_code,
88        content_type: if content_type.is_empty() {
89            None
90        } else {
91            Some(content_type.clone())
92        },
93        format_received,
94        was_redirected,
95        raw_body_bytes,
96    };
97
98    match format_received {
99        ContentFormat::Markdown | ContentFormat::PlainText => {
100            let cleaned = clean_text(&html);
101            let mut page = PageContent {
102                title: None,
103                content_length: cleaned.len(),
104                text: cleaned,
105                url: final_url,
106                requested_url: meta.requested_url,
107                status_code: meta.status_code,
108                content_type: meta.content_type,
109                format_received: meta.format_received,
110                was_redirected: meta.was_redirected,
111                raw_body_bytes: meta.raw_body_bytes,
112                diagnostics: Vec::new(),
113            };
114            page.diagnostics = diagnose(&page, "");
115            Ok(page)
116        }
117        ContentFormat::Html => {
118            let mut page = extract_readable(&html, &final_url)?;
119            page.requested_url = meta.requested_url;
120            page.status_code = meta.status_code;
121            page.content_type = meta.content_type;
122            page.format_received = meta.format_received;
123            page.was_redirected = meta.was_redirected;
124            page.raw_body_bytes = meta.raw_body_bytes;
125            page.diagnostics = diagnose(&page, &html);
126            Ok(page)
127        }
128    }
129}
130
/// Metadata captured from the HTTP response before extraction.
///
/// Built once in `fetch_and_extract` and copied field by field onto the
/// resulting `PageContent`, on both the markdown/plain-text path and the
/// HTML path.
struct ResponseMeta {
    /// The URL string as supplied by the caller.
    requested_url: String,
    /// Numeric HTTP status code of the response.
    status_code: u16,
    /// Value of the `Content-Type` response header; `None` when the header
    /// was missing, empty, or not readable as a string.
    content_type: Option<String>,
    /// Format derived from the content type by `detect_content_format`.
    format_received: ContentFormat,
    /// Whether the final response URL differs from the requested URL.
    was_redirected: bool,
    /// Length in bytes of the response body text.
    raw_body_bytes: usize,
}
140
141/// Extract readable content from raw HTML using Mozilla Readability algorithm.
142fn extract_readable(html: &str, url: &str) -> Result<PageContent, ReadError> {
143    use readability_rust::Readability;
144
145    let mut parser = Readability::new_with_base_uri(html, url, None)
146        .map_err(|e| ReadError::Parse(format!("{e}")))?;
147
148    let article = parser.parse().ok_or(ReadError::NoContent)?;
149
150    let title = article.title.clone();
151
152    // article.text_content is the cleaned plain text
153    // article.content is HTML — we convert to plain text ourselves for safety
154    let text = article
155        .text_content
156        .as_deref()
157        .or(article.content.as_deref())
158        .unwrap_or("")
159        .to_string();
160
161    if text.len() < 50 {
162        return Err(ReadError::InsufficientContent);
163    }
164
165    Ok(PageContent {
166        content_length: text.len(),
167        title,
168        text: clean_text(&text),
169        url: url.to_string(),
170        // Populated by caller (fetch_and_extract) after extraction
171        requested_url: url.to_string(),
172        status_code: 200,
173        content_type: None,
174        format_received: ContentFormat::Html,
175        was_redirected: false,
176        raw_body_bytes: 0,
177        diagnostics: Vec::new(),
178    })
179}
180
181pub fn diagnose(page: &PageContent, raw_html: &str) -> Vec<String> {
182    let mut warnings = Vec::new();
183    let text_lower = page.text.to_lowercase();
184    let html_lower = raw_html.to_lowercase();
185
186    let short_text = page.content_length < 500;
187    let has_loading_indicator = ["loading...", "loading documentation"]
188        .iter()
189        .any(|needle| text_lower.contains(needle));
190    let has_noscript = html_lower.contains("<noscript");
191    let nav_link_count = html_lower.matches("<nav").count()
192        + html_lower.matches("<a ").count()
193        + html_lower.matches("<a>").count();
194    let has_nav_shell_pattern = short_text && nav_link_count >= 8;
195    if short_text && (has_loading_indicator || has_noscript || has_nav_shell_pattern) {
196        warnings.push(
197            "Page appears to be a client-rendered shell. Content may require JavaScript."
198                .to_string(),
199        );
200    }
201
202    let very_short_text = page.content_length < 300;
203    let has_soft_404_indicator = [
204        "page not found",
205        "can't find that page",
206        "404",
207        "doesn't exist",
208        "has been moved",
209    ]
210    .iter()
211    .any(|needle| text_lower.contains(needle));
212    if page.status_code == 200 && very_short_text && has_soft_404_indicator {
213        warnings
214            .push("Page appears to be a soft 404 (HTTP 200 but error page content).".to_string());
215    }
216
217    if page.raw_body_bytes > 20 * 1024 && page.content_length < 2 * 1024 {
218        warnings.push(format!(
219            "Large page ({} bytes) but only {} chars extracted. Content may be incomplete.",
220            page.raw_body_bytes, page.content_length
221        ));
222    }
223
224    if page.raw_body_bytes > 100 * 1024
225        && (page.content_length as f64) < (page.raw_body_bytes as f64 * 0.1)
226    {
227        let pct = ((page.content_length as f64 / page.raw_body_bytes as f64) * 100.0).round();
228        warnings.push(format!(
229            "Significant content may have been lost during extraction ({}% of response retained).",
230            pct as usize
231        ));
232    }
233
234    warnings
235}
236
/// Normalize whitespace in extracted text.
///
/// Each line is trimmed on both sides, runs of blank lines are capped at two,
/// and leading/trailing whitespace of the whole result is removed.
fn clean_text(text: &str) -> String {
    const MAX_BLANK_RUN: u32 = 2;

    let mut cleaned = String::with_capacity(text.len());
    let mut blank_run = 0u32;

    for content in text.lines().map(str::trim) {
        if !content.is_empty() {
            blank_run = 0;
            cleaned.push_str(content);
            cleaned.push('\n');
            continue;
        }

        // Blank line: keep it only while the current run is within the cap.
        blank_run += 1;
        if blank_run <= MAX_BLANK_RUN {
            cleaned.push('\n');
        }
    }

    cleaned.trim().to_string()
}
258
259fn detect_content_format(content_type: &str) -> ContentFormat {
260    let content_type = content_type.to_ascii_lowercase();
261
262    if content_type.contains("text/markdown") || content_type.contains("text/x-markdown") {
263        ContentFormat::Markdown
264    } else if content_type.contains("text/html") || content_type.contains("application/xhtml+xml") {
265        ContentFormat::Html
266    } else {
267        ContentFormat::PlainText
268    }
269}
270
/// Errors produced while fetching a page and extracting its readable content.
#[derive(Debug)]
pub enum ReadError {
    /// The supplied URL could not be parsed; carries the parser's message.
    InvalidUrl(String),
    /// Sending the request or reading the response body failed.
    Fetch(String),
    /// The server answered with a non-success status: numeric code and reason phrase.
    HttpStatus(u16, String),
    /// The response's content type is not a textual media type; carries the header value.
    NotHtml(String),
    /// The readability parser could not be constructed for the document.
    Parse(String),
    /// The readability parser produced no article.
    NoContent,
    /// The response body or the extracted text was too short to be useful.
    InsufficientContent,
    /// The YouTube-specific extraction path failed; carries its message.
    Youtube(String),
}

impl std::fmt::Display for ReadError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidUrl(msg) => write!(f, "Invalid URL: {msg}"),
            Self::Fetch(msg) => write!(f, "Fetch failed: {msg}"),
            Self::HttpStatus(code, reason) => write!(f, "HTTP {code} {reason}"),
            Self::NotHtml(ct) => write!(f, "Not an HTML page (content-type: {ct})"),
            Self::Parse(msg) => write!(f, "Parse error: {msg}"),
            Self::NoContent => write!(f, "Could not extract readable content from page"),
            Self::InsufficientContent => write!(f, "Page returned insufficient content"),
            Self::Youtube(msg) => write!(f, "YouTube extraction failed: {msg}"),
        }
    }
}

// Lets callers box this as `dyn Error`, attach it as a `source`, or convert it
// into general-purpose error wrappers. No underlying source is kept because
// every variant already stores its cause as a string.
impl std::error::Error for ReadError {}
297
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an HTML `PageContent` fixture for the `diagnose` tests: status
    /// 200, `text/html`, not redirected, no diagnostics, and `content_length`
    /// equal to the byte length of `text`.
    fn html_page(title: &str, url: &str, text: &str, raw_body_bytes: usize) -> PageContent {
        PageContent {
            title: Some(title.to_string()),
            text: text.to_string(),
            url: url.to_string(),
            content_length: text.len(),
            requested_url: url.to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            format_received: ContentFormat::Html,
            was_redirected: false,
            raw_body_bytes,
            diagnostics: Vec::new(),
        }
    }

    #[test]
    fn accept_header_prefers_markdown() {
        assert_eq!(
            ACCEPT_HEADER,
            "text/markdown,text/plain;q=0.9,text/html;q=0.8,application/xhtml+xml;q=0.7,*/*;q=0.5"
        );
    }

    #[test]
    fn detect_content_format_treats_markdown_as_markdown() {
        assert_eq!(
            detect_content_format("text/markdown; charset=utf-8"),
            ContentFormat::Markdown
        );
    }

    #[test]
    fn detect_content_format_treats_plain_text_as_plain_text() {
        assert_eq!(
            detect_content_format("text/plain; charset=utf-8"),
            ContentFormat::PlainText
        );
        assert_eq!(
            detect_content_format("application/json"),
            ContentFormat::PlainText
        );
    }

    #[test]
    fn detect_content_format_treats_html_as_html() {
        assert_eq!(
            detect_content_format("text/html; charset=utf-8"),
            ContentFormat::Html
        );
        assert_eq!(
            detect_content_format("application/xhtml+xml"),
            ContentFormat::Html
        );
    }

    #[test]
    fn detect_content_format_ignores_ascii_case() {
        assert_eq!(detect_content_format("Text/HTML"), ContentFormat::Html);
        assert_eq!(
            detect_content_format("TEXT/MARKDOWN"),
            ContentFormat::Markdown
        );
    }

    #[test]
    fn detect_content_format_defaults_to_plain_text_for_empty_header() {
        assert_eq!(detect_content_format(""), ContentFormat::PlainText);
    }

    #[test]
    fn markdown_and_plain_text_skip_readability_cleaning_path() {
        let markdown = "# Title\n\n\nParagraph";
        let cleaned_markdown = clean_text(markdown);
        assert_eq!(cleaned_markdown, "# Title\n\n\nParagraph");
        assert_eq!(
            detect_content_format("text/markdown"),
            ContentFormat::Markdown
        );

        let plain = "  hello  \n\n\nworld  ";
        let cleaned_plain = clean_text(plain);
        assert_eq!(cleaned_plain, "hello\n\n\nworld");
        assert_eq!(
            detect_content_format("text/plain"),
            ContentFormat::PlainText
        );
    }

    #[test]
    fn clean_text_collapses_blank_lines() {
        let input = "Hello\n\n\n\n\nWorld\n\nFoo";
        let cleaned = clean_text(input);
        // Allows up to 2 blank lines (3 newlines total), then collapses.
        // Shorter runs are left alone.
        assert_eq!(cleaned, "Hello\n\n\nWorld\n\nFoo");
    }

    #[test]
    fn clean_text_trims_lines() {
        let input = "  hello  \n  world  ";
        let cleaned = clean_text(input);
        assert_eq!(cleaned, "hello\nworld");
    }

    #[test]
    fn extract_readable_from_html() {
        let html = r#"
        <html>
        <head><title>Test Article</title></head>
        <body>
            <nav>Skip this navigation</nav>
            <article>
                <h1>Test Article Title</h1>
                <p>This is the main content of the article. It has enough text to be
                considered readable content by the readability algorithm. We need to make
                sure there is sufficient content here for the extraction to work properly.
                The readability algorithm looks for substantial blocks of text content.</p>
                <p>Here is another paragraph with more substantial content to ensure that
                the extraction algorithm has enough material to work with. This paragraph
                adds additional context and information that would be typical in a real
                web article about some topic.</p>
            </article>
            <footer>Copyright 2024</footer>
        </body>
        </html>"#;

        let result = extract_readable(html, "https://example.com/test");
        match result {
            Ok(page) => {
                assert!(page.text.contains("main content"));
                assert!(!page.text.contains("Skip this navigation"));
                assert_eq!(page.url, "https://example.com/test");
                assert_eq!(page.requested_url, "https://example.com/test");
                assert_eq!(page.status_code, 200);
                assert!(!page.was_redirected);
                assert_eq!(page.raw_body_bytes, 0);
                assert!(page.content_type.is_none());
                assert!(page.diagnostics.is_empty());
            }
            Err(ReadError::InsufficientContent) | Err(ReadError::NoContent) => {
                // Readability may not extract from minimal HTML — that's acceptable
            }
            Err(e) => panic!("Unexpected error: {e}"),
        }
    }

    #[test]
    fn response_metadata_can_be_applied_after_extraction() {
        let html = r#"
        <html>
        <head><title>Redirected Article</title></head>
        <body>
            <article>
                <p>This article has enough body text to survive readability extraction and
                prove that metadata can be preserved when the requested URL differs from
                the final URL after redirects.</p>
                <p>Additional text keeps the extractor happy and representative of a real page.</p>
            </article>
        </body>
        </html>"#;

        let mut page = extract_readable(html, "https://example.com/final").unwrap();
        page.requested_url = "https://example.com/start".to_string();
        page.status_code = 200;
        page.content_type = Some("text/html; charset=utf-8".to_string());
        page.format_received = ContentFormat::Html;
        page.was_redirected = true;
        page.raw_body_bytes = html.len();

        assert_eq!(page.url, "https://example.com/final");
        assert_eq!(page.requested_url, "https://example.com/start");
        assert_eq!(page.status_code, 200);
        assert_eq!(
            page.content_type.as_deref(),
            Some("text/html; charset=utf-8")
        );
        assert!(page.was_redirected);
        assert_eq!(page.raw_body_bytes, html.len());
    }

    #[test]
    fn diagnose_spa_shell_from_loading_text() {
        let page = html_page(
            "Docs",
            "https://example.com/docs",
            "Loading documentation...",
            2_000,
        );

        let warnings = diagnose(
            &page,
            "<html><body><noscript>Enable JS</noscript></body></html>",
        );
        assert!(warnings.iter().any(|w| w.contains("client-rendered shell")));
    }

    #[test]
    fn diagnose_soft_404_with_http_200() {
        let page = html_page(
            "Missing",
            "https://example.com/missing",
            "Page not found. The page has been moved.",
            1_500,
        );

        let warnings = diagnose(&page, "<html><body>404</body></html>");
        assert!(warnings.iter().any(|w| w.contains("soft 404")));
    }

    #[test]
    fn diagnose_does_not_flag_normal_page() {
        let text = "This is a normal documentation page with enough content to explain installation, configuration, and usage in detail. It includes several paragraphs of useful information for readers and should not be treated as a shell or error page. Extra explanation here keeps it comfortably above the short-content heuristics and avoids false positives.";
        let page = html_page("Guide", "https://example.com/guide", text, 8_000);

        let warnings = diagnose(
            &page,
            "<html><body><article>real docs</article></body></html>",
        );
        assert!(warnings.is_empty());
    }

    #[test]
    fn diagnose_low_extraction_ratio_warning() {
        let page = html_page(
            "Big Page",
            "https://example.com/big",
            "A short extracted summary.",
            150_000,
        );

        let warnings = diagnose(&page, "<html></html>");
        assert!(warnings.iter().any(|w| w.contains("Large page")));
        assert!(warnings
            .iter()
            .any(|w| w.contains("Significant content may have been lost")));
    }
}