Skip to main content

webfetch/
lib.rs

1//! webfetch — token-efficient web content fetcher.
2//!
3//! The defining feature is **reference-style URL preservation**: instead of
4//! stripping links to their domain (losing the ability to cite or follow
5//! them) or expanding full URLs inline (wasting tokens), links are replaced
6//! with compact `[N]` markers and collected into a recoverable reference list.
7
8// Shared primitives live in webfetch-core; re-export them so both this
9// crate's internal modules (via `crate::compress` / `crate::refs`) and
10// external callers keep a stable path.
11pub use webfetch_core::{compress, refs};
12
13pub mod convert;
14pub mod extract;
15pub mod fetch;
16pub mod guard;
17pub mod media;
18pub mod types;
19
20pub use fetch::fetch_page;
21use media::Media;
22use types::{ContentType, FetchOptions, FetchResult, Metadata};
23
24use scraper::Html;
25
26/// Convert already-fetched HTML into a [`FetchResult`] without any network I/O.
27///
28/// Useful for tests and for callers that obtain HTML by other means. Always
29/// treats the input as HTML; use [`convert_body`] for media-aware handling.
30pub fn convert_html(html: &str, source_url: &str, options: &FetchOptions) -> FetchResult {
31    convert_body(html, source_url, Some("text/html"), options)
32}
33
34/// Convert a fetched body to a [`FetchResult`], choosing how to treat it based
35/// on its `Content-Type` (or a sniff of the body). HTML is extracted; JSON is
36/// pretty-printed; other text is passed through verbatim; binary is summarized.
37pub fn convert_body(
38    body: &str,
39    source_url: &str,
40    content_type_header: Option<&str>,
41    options: &FetchOptions,
42) -> FetchResult {
43    let media = media::classify(content_type_header, body);
44
45    let (title, mut content, references, metadata, output_type) = match &media {
46        Media::Html => {
47            let doc = Html::parse_document(body);
48            let title = extract::extract_title(&doc);
49            let metadata = extract::extract_metadata(&doc);
50            let converted = convert::convert(body, source_url, options.content_type);
51            (
52                title,
53                converted.content,
54                converted.references,
55                metadata,
56                options.content_type,
57            )
58        }
59        Media::Json => {
60            // Pretty-print so an agent reads clean JSON; fall back to raw.
61            let pretty = serde_json::from_str::<serde_json::Value>(body)
62                .ok()
63                .and_then(|v| serde_json::to_string_pretty(&v).ok())
64                .unwrap_or_else(|| body.trim().to_string());
65            (
66                String::new(),
67                pretty,
68                Vec::new(),
69                Metadata::default(),
70                ContentType::Structured,
71            )
72        }
73        Media::Text => (
74            String::new(),
75            body.trim().to_string(),
76            Vec::new(),
77            Metadata::default(),
78            ContentType::Text,
79        ),
80        Media::Other(ct) => (
81            String::new(),
82            format!(
83                "[non-text content: {ct}, {} bytes — not rendered]",
84                body.len()
85            ),
86            Vec::new(),
87            Metadata::default(),
88            options.content_type,
89        ),
90    };
91
92    if let Some(max) = options.max_tokens {
93        content = compress::truncate_to_tokens(&content, max);
94    }
95
96    FetchResult {
97        token_estimate: compress::estimate_tokens(&content),
98        title,
99        final_url: source_url.to_string(),
100        content,
101        content_type: output_type,
102        media: media.label(),
103        references,
104        metadata,
105        source: source_url.to_string(),
106    }
107}
108
109/// Fetch a URL and convert it according to `options`.
110pub async fn fetch_and_convert(options: FetchOptions) -> anyhow::Result<FetchResult> {
111    let page = fetch::fetch_page(&options.url, options.timeout_secs).await?;
112    let mut result = convert_body(
113        &page.body,
114        &page.final_url,
115        page.content_type.as_deref(),
116        &options,
117    );
118    result.final_url = page.final_url;
119    Ok(result)
120}
121
122/// Parse a content-type string ("text" | "markdown" | "structured").
123pub fn parse_content_type(s: &str) -> ContentType {
124    ContentType::parse(s)
125}