1pub use webfetch_core::{compress, refs};
12
13pub mod convert;
14pub mod extract;
15pub mod fetch;
16pub mod guard;
17pub mod media;
18pub mod types;
19
20pub use fetch::fetch_page;
21use media::Media;
22use types::{ContentType, FetchOptions, FetchResult, Metadata};
23
24use scraper::Html;
25
26pub fn convert_html(html: &str, source_url: &str, options: &FetchOptions) -> FetchResult {
31 convert_body(html, source_url, Some("text/html"), options)
32}
33
34pub fn convert_body(
38 body: &str,
39 source_url: &str,
40 content_type_header: Option<&str>,
41 options: &FetchOptions,
42) -> FetchResult {
43 let media = media::classify(content_type_header, body);
44
45 let (title, mut content, references, metadata, output_type) = match &media {
46 Media::Html => {
47 let doc = Html::parse_document(body);
48 let title = extract::extract_title(&doc);
49 let metadata = extract::extract_metadata(&doc);
50 let converted = convert::convert(body, source_url, options.content_type);
51 (
52 title,
53 converted.content,
54 converted.references,
55 metadata,
56 options.content_type,
57 )
58 }
59 Media::Json => {
60 let pretty = serde_json::from_str::<serde_json::Value>(body)
62 .ok()
63 .and_then(|v| serde_json::to_string_pretty(&v).ok())
64 .unwrap_or_else(|| body.trim().to_string());
65 (
66 String::new(),
67 pretty,
68 Vec::new(),
69 Metadata::default(),
70 ContentType::Structured,
71 )
72 }
73 Media::Text => (
74 String::new(),
75 body.trim().to_string(),
76 Vec::new(),
77 Metadata::default(),
78 ContentType::Text,
79 ),
80 Media::Other(ct) => (
81 String::new(),
82 format!(
83 "[non-text content: {ct}, {} bytes — not rendered]",
84 body.len()
85 ),
86 Vec::new(),
87 Metadata::default(),
88 options.content_type,
89 ),
90 };
91
92 content = strip_duplicate_title(&title, content);
95
96 if let Some(max) = options.max_tokens {
97 let refs_block = if output_type == ContentType::Text {
101 convert::text::render_references(&references)
102 } else {
103 String::new()
104 };
105 content = compress::truncate_preserving_refs(&content, &refs_block, max);
106 }
107
108 FetchResult {
109 token_estimate: compress::estimate_tokens(&content),
110 title,
111 final_url: source_url.to_string(),
112 content,
113 content_type: output_type,
114 media: media.label(),
115 references,
116 metadata,
117 source: source_url.to_string(),
118 }
119}
120
121fn strip_duplicate_title(title: &str, content: String) -> String {
126 if title.is_empty() {
127 return content;
128 }
129 let mut parts = content.splitn(2, '\n');
130 let first = parts.next().unwrap_or("");
131 if compress::compress_text(first) == compress::compress_text(title) {
132 return parts
133 .next()
134 .unwrap_or("")
135 .trim_start_matches('\n')
136 .to_string();
137 }
138 content
139}
140
141pub async fn fetch_and_convert(options: FetchOptions) -> anyhow::Result<FetchResult> {
143 let page = fetch::fetch_page(&options.url, options.timeout_secs).await?;
144 let mut result = convert_body(
145 &page.body,
146 &page.final_url,
147 page.content_type.as_deref(),
148 &options,
149 );
150 result.final_url = page.final_url;
151 Ok(result)
152}
153
154pub fn parse_content_type(s: &str) -> ContentType {
156 ContentType::parse(s)
157}