1pub use webfetch_core::{compress, refs};
12
13pub mod convert;
14pub mod extract;
15pub mod fetch;
16pub mod media;
17pub mod types;
18
19pub use fetch::fetch_page;
20use media::Media;
21use types::{ContentType, FetchOptions, FetchResult, Metadata};
22
23use scraper::Html;
24
25pub fn convert_html(html: &str, source_url: &str, options: &FetchOptions) -> FetchResult {
30 convert_body(html, source_url, Some("text/html"), options)
31}
32
33pub fn convert_body(
37 body: &str,
38 source_url: &str,
39 content_type_header: Option<&str>,
40 options: &FetchOptions,
41) -> FetchResult {
42 let media = media::classify(content_type_header, body);
43
44 let (title, mut content, references, metadata, output_type) = match &media {
45 Media::Html => {
46 let doc = Html::parse_document(body);
47 let title = extract::extract_title(&doc);
48 let metadata = extract::extract_metadata(&doc);
49 let converted = convert::convert(body, source_url, options.content_type);
50 (
51 title,
52 converted.content,
53 converted.references,
54 metadata,
55 options.content_type,
56 )
57 }
58 Media::Json => {
59 let pretty = serde_json::from_str::<serde_json::Value>(body)
61 .ok()
62 .and_then(|v| serde_json::to_string_pretty(&v).ok())
63 .unwrap_or_else(|| body.trim().to_string());
64 (
65 String::new(),
66 pretty,
67 Vec::new(),
68 Metadata::default(),
69 ContentType::Structured,
70 )
71 }
72 Media::Text => (
73 String::new(),
74 body.trim().to_string(),
75 Vec::new(),
76 Metadata::default(),
77 ContentType::Text,
78 ),
79 Media::Other(ct) => (
80 String::new(),
81 format!(
82 "[non-text content: {ct}, {} bytes — not rendered]",
83 body.len()
84 ),
85 Vec::new(),
86 Metadata::default(),
87 options.content_type,
88 ),
89 };
90
91 if let Some(max) = options.max_tokens {
92 content = compress::truncate_to_tokens(&content, max);
93 }
94
95 FetchResult {
96 token_estimate: compress::estimate_tokens(&content),
97 title,
98 final_url: source_url.to_string(),
99 content,
100 content_type: output_type,
101 media: media.label(),
102 references,
103 metadata,
104 source: source_url.to_string(),
105 }
106}
107
108pub async fn fetch_and_convert(options: FetchOptions) -> anyhow::Result<FetchResult> {
110 let page = fetch::fetch_page(&options.url, options.timeout_secs).await?;
111 let mut result = convert_body(
112 &page.body,
113 &page.final_url,
114 page.content_type.as_deref(),
115 &options,
116 );
117 result.final_url = page.final_url;
118 Ok(result)
119}
120
121pub fn parse_content_type(s: &str) -> ContentType {
123 ContentType::parse(s)
124}