Skip to main content

webfetch/
lib.rs

1//! webfetch — token-efficient web content fetcher.
2//!
3//! The defining feature is **reference-style URL preservation**: instead of
4//! stripping links to their domain (losing the ability to cite or follow
5//! them) or expanding full URLs inline (wasting tokens), links are replaced
6//! with compact `[N]` markers and collected into a recoverable reference list.
7
8// Shared primitives live in webfetch-core; re-export them so both this
9// crate's internal modules (via `crate::compress` / `crate::refs`) and
10// external callers keep a stable path.
11pub use webfetch_core::{compress, refs};
12
13pub mod convert;
14pub mod extract;
15pub mod fetch;
16pub mod guard;
17pub mod media;
18pub mod types;
19
20pub use fetch::fetch_page;
21use media::Media;
22use types::{ContentType, FetchOptions, FetchResult, Metadata};
23
24use scraper::Html;
25
26/// Convert already-fetched HTML into a [`FetchResult`] without any network I/O.
27///
28/// Useful for tests and for callers that obtain HTML by other means. Always
29/// treats the input as HTML; use [`convert_body`] for media-aware handling.
30pub fn convert_html(html: &str, source_url: &str, options: &FetchOptions) -> FetchResult {
31    convert_body(html, source_url, Some("text/html"), options)
32}
33
34/// Convert a fetched body to a [`FetchResult`], choosing how to treat it based
35/// on its `Content-Type` (or a sniff of the body). HTML is extracted; JSON is
36/// pretty-printed; other text is passed through verbatim; binary is summarized.
37pub fn convert_body(
38    body: &str,
39    source_url: &str,
40    content_type_header: Option<&str>,
41    options: &FetchOptions,
42) -> FetchResult {
43    let media = media::classify(content_type_header, body);
44
45    let (title, mut content, references, metadata, output_type) = match &media {
46        Media::Html => {
47            let doc = Html::parse_document(body);
48            let title = extract::extract_title(&doc);
49            let metadata = extract::extract_metadata(&doc);
50            let converted = convert::convert(body, source_url, options.content_type);
51            (
52                title,
53                converted.content,
54                converted.references,
55                metadata,
56                options.content_type,
57            )
58        }
59        Media::Json => {
60            // Pretty-print so an agent reads clean JSON; fall back to raw.
61            let pretty = serde_json::from_str::<serde_json::Value>(body)
62                .ok()
63                .and_then(|v| serde_json::to_string_pretty(&v).ok())
64                .unwrap_or_else(|| body.trim().to_string());
65            (
66                String::new(),
67                pretty,
68                Vec::new(),
69                Metadata::default(),
70                ContentType::Structured,
71            )
72        }
73        Media::Text => (
74            String::new(),
75            body.trim().to_string(),
76            Vec::new(),
77            Metadata::default(),
78            ContentType::Text,
79        ),
80        Media::Other(ct) => (
81            String::new(),
82            format!(
83                "[non-text content: {ct}, {} bytes — not rendered]",
84                body.len()
85            ),
86            Vec::new(),
87            Metadata::default(),
88            options.content_type,
89        ),
90    };
91
92    // Drop a leading body line that merely repeats the title (common when the
93    // title was derived from the page's first <h1>, which also opens the body).
94    content = strip_duplicate_title(&title, content);
95
96    if let Some(max) = options.max_tokens {
97        // In reference-style text output the `References:` block is appended to
98        // the end of `content`; truncate the body but keep that block intact so
99        // inline `[N]` markers still resolve (see truncate_preserving_refs).
100        let refs_block = if output_type == ContentType::Text {
101            convert::text::render_references(&references)
102        } else {
103            String::new()
104        };
105        content = compress::truncate_preserving_refs(&content, &refs_block, max);
106    }
107
108    FetchResult {
109        token_estimate: compress::estimate_tokens(&content),
110        title,
111        final_url: source_url.to_string(),
112        content,
113        content_type: output_type,
114        media: media.label(),
115        references,
116        metadata,
117        source: source_url.to_string(),
118    }
119}
120
121/// When the title was derived from the page's first heading, the body repeats
122/// it as its opening line. Drop that leading line when it normalizes to the
123/// same text as `title`. Conservative: only an exact normalized match of the
124/// *first* line is removed, so genuine content is never lost.
125fn strip_duplicate_title(title: &str, content: String) -> String {
126    if title.is_empty() {
127        return content;
128    }
129    let mut parts = content.splitn(2, '\n');
130    let first = parts.next().unwrap_or("");
131    if compress::compress_text(first) == compress::compress_text(title) {
132        return parts
133            .next()
134            .unwrap_or("")
135            .trim_start_matches('\n')
136            .to_string();
137    }
138    content
139}
140
141/// Fetch a URL and convert it according to `options`.
142pub async fn fetch_and_convert(options: FetchOptions) -> anyhow::Result<FetchResult> {
143    let page = fetch::fetch_page(&options.url, options.timeout_secs).await?;
144    let mut result = convert_body(
145        &page.body,
146        &page.final_url,
147        page.content_type.as_deref(),
148        &options,
149    );
150    result.final_url = page.final_url;
151    Ok(result)
152}
153
154/// Parse a content-type string ("text" | "markdown" | "structured").
155pub fn parse_content_type(s: &str) -> ContentType {
156    ContentType::parse(s)
157}