1pub use webfetch_core::{compress, refs};
12
13pub mod convert;
14pub mod extract;
15pub mod fetch;
16pub mod guard;
17pub mod media;
18pub mod types;
19
20pub use fetch::fetch_page;
21use media::Media;
22use types::{ContentType, FetchOptions, FetchResult, Metadata};
23
24use scraper::Html;
25
26pub fn convert_html(html: &str, source_url: &str, options: &FetchOptions) -> FetchResult {
31 convert_body(html, source_url, Some("text/html"), options)
32}
33
34pub fn convert_body(
38 body: &str,
39 source_url: &str,
40 content_type_header: Option<&str>,
41 options: &FetchOptions,
42) -> FetchResult {
43 let media = media::classify(content_type_header, body);
44
45 let (title, mut content, references, metadata, output_type) = match &media {
46 Media::Html => {
47 let doc = Html::parse_document(body);
48 let title = extract::extract_title(&doc);
49 let metadata = extract::extract_metadata(&doc);
50 let converted = convert::convert(body, source_url, options.content_type);
51 (
52 title,
53 converted.content,
54 converted.references,
55 metadata,
56 options.content_type,
57 )
58 }
59 Media::Json => {
60 let pretty = serde_json::from_str::<serde_json::Value>(body)
62 .ok()
63 .and_then(|v| serde_json::to_string_pretty(&v).ok())
64 .unwrap_or_else(|| body.trim().to_string());
65 (
66 String::new(),
67 pretty,
68 Vec::new(),
69 Metadata::default(),
70 ContentType::Structured,
71 )
72 }
73 Media::Text => (
74 String::new(),
75 body.trim().to_string(),
76 Vec::new(),
77 Metadata::default(),
78 ContentType::Text,
79 ),
80 Media::Other(ct) => (
81 String::new(),
82 format!(
83 "[non-text content: {ct}, {} bytes — not rendered]",
84 body.len()
85 ),
86 Vec::new(),
87 Metadata::default(),
88 options.content_type,
89 ),
90 };
91
92 if let Some(max) = options.max_tokens {
93 content = compress::truncate_to_tokens(&content, max);
94 }
95
96 FetchResult {
97 token_estimate: compress::estimate_tokens(&content),
98 title,
99 final_url: source_url.to_string(),
100 content,
101 content_type: output_type,
102 media: media.label(),
103 references,
104 metadata,
105 source: source_url.to_string(),
106 }
107}
108
109pub async fn fetch_and_convert(options: FetchOptions) -> anyhow::Result<FetchResult> {
111 let page = fetch::fetch_page(&options.url, options.timeout_secs).await?;
112 let mut result = convert_body(
113 &page.body,
114 &page.final_url,
115 page.content_type.as_deref(),
116 &options,
117 );
118 result.final_url = page.final_url;
119 Ok(result)
120}
121
122pub fn parse_content_type(s: &str) -> ContentType {
124 ContentType::parse(s)
125}