Skip to main content

webfetch/
lib.rs

1//! webfetch — token-efficient web content fetcher.
2//!
3//! The defining feature is **reference-style URL preservation**: instead of
4//! stripping links to their domain (losing the ability to cite or follow
5//! them) or expanding full URLs inline (wasting tokens), links are replaced
6//! with compact `[N]` markers and collected into a recoverable reference list.
7
8// Shared primitives live in webfetch-core; re-export them so both this
9// crate's internal modules (via `crate::compress` / `crate::refs`) and
10// external callers keep a stable path.
11pub use webfetch_core::{compress, refs};
12
13pub mod convert;
14pub mod extract;
15pub mod fetch;
16pub mod media;
17pub mod types;
18
19pub use fetch::fetch_page;
20use media::Media;
21use types::{ContentType, FetchOptions, FetchResult, Metadata};
22
23use scraper::Html;
24
25/// Convert already-fetched HTML into a [`FetchResult`] without any network I/O.
26///
27/// Useful for tests and for callers that obtain HTML by other means. Always
28/// treats the input as HTML; use [`convert_body`] for media-aware handling.
29pub fn convert_html(html: &str, source_url: &str, options: &FetchOptions) -> FetchResult {
30    convert_body(html, source_url, Some("text/html"), options)
31}
32
33/// Convert a fetched body to a [`FetchResult`], choosing how to treat it based
34/// on its `Content-Type` (or a sniff of the body). HTML is extracted; JSON is
35/// pretty-printed; other text is passed through verbatim; binary is summarized.
36pub fn convert_body(
37    body: &str,
38    source_url: &str,
39    content_type_header: Option<&str>,
40    options: &FetchOptions,
41) -> FetchResult {
42    let media = media::classify(content_type_header, body);
43
44    let (title, mut content, references, metadata, output_type) = match &media {
45        Media::Html => {
46            let doc = Html::parse_document(body);
47            let title = extract::extract_title(&doc);
48            let metadata = extract::extract_metadata(&doc);
49            let converted = convert::convert(body, source_url, options.content_type);
50            (
51                title,
52                converted.content,
53                converted.references,
54                metadata,
55                options.content_type,
56            )
57        }
58        Media::Json => {
59            // Pretty-print so an agent reads clean JSON; fall back to raw.
60            let pretty = serde_json::from_str::<serde_json::Value>(body)
61                .ok()
62                .and_then(|v| serde_json::to_string_pretty(&v).ok())
63                .unwrap_or_else(|| body.trim().to_string());
64            (
65                String::new(),
66                pretty,
67                Vec::new(),
68                Metadata::default(),
69                ContentType::Structured,
70            )
71        }
72        Media::Text => (
73            String::new(),
74            body.trim().to_string(),
75            Vec::new(),
76            Metadata::default(),
77            ContentType::Text,
78        ),
79        Media::Other(ct) => (
80            String::new(),
81            format!(
82                "[non-text content: {ct}, {} bytes — not rendered]",
83                body.len()
84            ),
85            Vec::new(),
86            Metadata::default(),
87            options.content_type,
88        ),
89    };
90
91    if let Some(max) = options.max_tokens {
92        content = compress::truncate_to_tokens(&content, max);
93    }
94
95    FetchResult {
96        token_estimate: compress::estimate_tokens(&content),
97        title,
98        final_url: source_url.to_string(),
99        content,
100        content_type: output_type,
101        media: media.label(),
102        references,
103        metadata,
104        source: source_url.to_string(),
105    }
106}
107
108/// Fetch a URL and convert it according to `options`.
109pub async fn fetch_and_convert(options: FetchOptions) -> anyhow::Result<FetchResult> {
110    let page = fetch::fetch_page(&options.url, options.timeout_secs).await?;
111    let mut result = convert_body(
112        &page.body,
113        &page.final_url,
114        page.content_type.as_deref(),
115        &options,
116    );
117    result.final_url = page.final_url;
118    Ok(result)
119}
120
121/// Parse a content-type string ("text" | "markdown" | "structured").
122pub fn parse_content_type(s: &str) -> ContentType {
123    ContentType::parse(s)
124}