pub use webfetch_core::{compress, refs};
pub mod convert;
pub mod extract;
pub mod fetch;
pub mod guard;
pub mod media;
pub mod types;
pub use fetch::fetch_page;
use media::Media;
use types::{ContentType, FetchOptions, FetchResult, Metadata};
use scraper::Html;
/// Converts a body that the caller already knows to be HTML.
///
/// This is a convenience wrapper around [`convert_body`] that supplies
/// `text/html` as the content-type header, for callers that hold markup
/// without an accompanying HTTP response.
///
/// * `html` - the document text to convert.
/// * `source_url` - URL the document came from; forwarded unchanged.
/// * `options` - conversion options; forwarded unchanged.
pub fn convert_html(html: &str, source_url: &str, options: &FetchOptions) -> FetchResult {
    // There is no real response header on this path, so one is supplied.
    let assumed_header = Some("text/html");
    convert_body(html, source_url, assumed_header, options)
}
pub fn convert_body(
body: &str,
source_url: &str,
content_type_header: Option<&str>,
options: &FetchOptions,
) -> FetchResult {
let media = media::classify(content_type_header, body);
let (title, mut content, references, metadata, output_type) = match &media {
Media::Html => {
let doc = Html::parse_document(body);
let title = extract::extract_title(&doc);
let metadata = extract::extract_metadata(&doc);
let converted = convert::convert(body, source_url, options.content_type);
(
title,
converted.content,
converted.references,
metadata,
options.content_type,
)
}
Media::Json => {
let pretty = serde_json::from_str::<serde_json::Value>(body)
.ok()
.and_then(|v| serde_json::to_string_pretty(&v).ok())
.unwrap_or_else(|| body.trim().to_string());
(
String::new(),
pretty,
Vec::new(),
Metadata::default(),
ContentType::Structured,
)
}
Media::Text => (
String::new(),
body.trim().to_string(),
Vec::new(),
Metadata::default(),
ContentType::Text,
),
Media::Other(ct) => (
String::new(),
format!(
"[non-text content: {ct}, {} bytes — not rendered]",
body.len()
),
Vec::new(),
Metadata::default(),
options.content_type,
),
};
content = strip_duplicate_title(&title, content);
if let Some(max) = options.max_tokens {
let refs_block = if output_type == ContentType::Text {
convert::text::render_references(&references)
} else {
String::new()
};
content = compress::truncate_preserving_refs(&content, &refs_block, max);
}
FetchResult {
token_estimate: compress::estimate_tokens(&content),
title,
final_url: source_url.to_string(),
content,
content_type: output_type,
media: media.label(),
references,
metadata,
source: source_url.to_string(),
}
}
/// Drops the first line of `content` when it only repeats `title`.
///
/// The first line and the title are compared after both have been passed
/// through `compress::compress_text`. On a match, the remainder of the content
/// is returned with any leading newline characters removed (an empty string
/// if there was no further line). If `title` is empty or the first line does
/// not match, `content` is returned unchanged.
fn strip_duplicate_title(title: &str, content: String) -> String {
    if !title.is_empty() {
        // Content without a newline is treated as a single heading line with
        // nothing after it.
        let (heading, remainder) = content
            .split_once('\n')
            .unwrap_or((content.as_str(), ""));

        if compress::compress_text(heading) == compress::compress_text(title) {
            return remainder.trim_start_matches('\n').to_owned();
        }
    }
    content
}
/// Fetches `options.url` and converts the response body in one step.
///
/// The page is retrieved with `fetch::fetch_page` using
/// `options.timeout_secs`, then handed to [`convert_body`] together with the
/// response's content type and its final URL.
///
/// # Errors
///
/// Returns whatever error `fetch::fetch_page` reports; conversion itself does
/// not fail.
pub async fn fetch_and_convert(options: FetchOptions) -> anyhow::Result<FetchResult> {
    let fetched = fetch::fetch_page(&options.url, options.timeout_secs).await?;

    let header = fetched.content_type.as_deref();
    let mut converted = convert_body(&fetched.body, &fetched.final_url, header, &options);

    // `convert_body` already fills `final_url` from the URL it is given; the
    // explicit assignment keeps that guarantee local to this function.
    converted.final_url = fetched.final_url;
    Ok(converted)
}
/// Parses `s` into a [`ContentType`].
///
/// Thin public wrapper that delegates directly to `ContentType::parse`; the
/// accepted spellings and the handling of unrecognised input are whatever
/// that method defines.
pub fn parse_content_type(s: &str) -> ContentType {
ContentType::parse(s)
}