use webfetch::compress::{compress_text, estimate_tokens, truncate_to_tokens};
use webfetch::convert::convert;
use webfetch::convert::text::{html_to_text_with_refs, render_references};
use webfetch::media::{classify, Media};
use webfetch::types::{ContentType, FetchOptions};
use webfetch::{convert_body, convert_html};
// HTML fixtures embedded into the test binary at compile time via `include_str!`.
// DOCS: page used by the reference-numbering, structured-output and HTML-path tests.
const DOCS: &str = include_str!("fixtures/docs.html");
// BLOG: page used by the text-with-reference-block and markdown tests.
const BLOG: &str = include_str!("fixtures/blog.html");
// SPA: shell page that the tests expect to convert to an empty body with no references.
const SPA: &str = include_str!("fixtures/spa-shell.html");
/// Checks that `compress_text` folds the newline/space run between words
/// into a single space.
#[test]
fn test_compress_collapses_whitespace() {
    let raw = "hello world\n\n\n test";
    assert_eq!(compress_text(raw), "hello world test");
}
/// Checks that the `▶` glyph is dropped from the compressed text without
/// leaving a doubled space behind.
#[test]
fn test_compress_removes_decorative() {
    let raw = "Click ▶ to play";
    assert_eq!(compress_text(raw), "Click to play");
}
/// Checks token-budget truncation and token estimation on a 100-character input.
///
/// With a budget of 5 tokens the output must still begin with 20 `a`s and must
/// mention "truncated"; the untruncated input must estimate to exactly 25 tokens.
#[test]
fn test_truncate_to_tokens() {
    let text = "a".repeat(100);

    let out = truncate_to_tokens(&text, 5);
    assert!(out.starts_with(&"a".repeat(20)), "out was: {out}");
    assert!(out.contains("truncated"), "out was: {out}");

    // `assert_eq!` reports the actual estimate on failure, unlike `assert!(a == b)`.
    assert_eq!(estimate_tokens(&text), 25);
}
/// Checks that links in the DOCS fixture are replaced by numbered inline
/// markers and that the collected references carry the resolved URLs.
#[test]
fn test_links_become_inline_references() {
    let base = "https://docs.example.com/page";
    let (text, refs) = html_to_text_with_refs(DOCS, base);

    assert!(text.contains("users endpoint [1]"), "text was: {text}");
    assert!(text.contains("OAuth2 [2]"), "text was: {text}");

    // Guard the indexing below so a regression reports the collected
    // references instead of an opaque index-out-of-bounds panic.
    assert!(refs.len() >= 2, "expected at least two refs: {refs:?}");
    assert_eq!(refs[0].url, "https://docs.example.com/api/v2/users");
    assert_eq!(refs[1].url, "https://auth.example.com/oauth2");
}
/// Checks that the `[1]` marker appears twice in the DOCS text while the
/// reference list stays at three entries, the third being the guide URL.
#[test]
fn test_duplicate_urls_share_one_reference() {
    let (text, refs) = html_to_text_with_refs(DOCS, "https://docs.example.com/page");

    assert_eq!(text.matches("[1]").count(), 2, "text was: {text}");
    assert_eq!(refs.len(), 3, "refs: {refs:?}");
    assert_eq!(refs[2].url, "https://docs.example.com/guide");
}
/// Checks the exact layout of the rendered block: a `References:` header
/// followed by one `[index] url` line per entry.
#[test]
fn test_references_block_rendering() {
    // Local constructor so each fixture entry fits on one line.
    let make_ref = |index, url: &'static str, text: &'static str| {
        webfetch::types::UrlReference {
            index,
            url: url.into(),
            text: text.into(),
        }
    };
    let refs = vec![
        make_ref(1, "https://a.test/x", "x"),
        make_ref(2, "https://b.test/y", "y"),
    ];

    let expected = "References:\n[1] https://a.test/x\n[2] https://b.test/y";
    assert_eq!(render_references(&refs), expected);
}
/// Checks that text conversion of the BLOG fixture contains both the inline
/// `[1]` marker and a trailing `References:` listing with the resolved URL.
#[test]
fn test_text_output_appends_reference_block() {
    let page = convert(BLOG, "https://blog.example.com/post", ContentType::Text);
    let body = &page.content;

    assert!(body.contains("references page [1]"));
    assert!(body.contains("References:"));
    assert!(body.contains("[1] https://blog.example.com/refs"));
    assert!(body.contains("on our references page"));
}
/// Checks that the "ignore me" marker text from the DOCS fixture never
/// reaches the extracted text.
#[test]
fn test_skippable_elements_excluded() {
    let (body, _refs) = html_to_text_with_refs(DOCS, "https://docs.example.com/");
    assert!(!body.contains("ignore me"));
}
/// Checks that markdown conversion of the BLOG fixture keeps the link as an
/// inline `[text](url)` and emits the title as a `#` heading.
#[test]
fn test_markdown_keeps_links_inline() {
    let page = convert(BLOG, "https://blog.example.com/post", ContentType::Markdown);
    let markdown = &page.content;

    assert!(markdown.contains("[references page](https://blog.example.com/refs)"));
    assert!(markdown.contains("# Why References Matter"));
}
/// Checks that structured conversion of the DOCS fixture yields JSON with a
/// `blocks` array and a `references` array of three entries.
#[test]
fn test_structured_emits_json_with_references() {
    let page = convert(DOCS, "https://docs.example.com/page", ContentType::Structured);
    let parsed: serde_json::Value = serde_json::from_str(&page.content).unwrap();

    let blocks = &parsed["blocks"];
    assert!(blocks.is_array());

    let references = &parsed["references"];
    assert!(references.is_array());
    assert_eq!(references.as_array().unwrap().len(), 3);
}
/// Checks that the SPA shell fixture converts to no references and to
/// content that is empty once trimmed.
#[test]
fn test_spa_shell_yields_empty_body() {
    let shell = convert(SPA, "https://spa.example.com/", ContentType::Text);

    assert!(shell.references.is_empty());
    assert!(shell.content.trim().is_empty());
}
/// Checks the media class chosen for each Content-Type header when the body
/// is empty.
#[test]
fn test_classify_by_header() {
    // (Content-Type header, expected classification)
    let cases = [
        ("text/html; charset=utf-8", Media::Html),
        ("application/json", Media::Json),
        ("text/plain", Media::Text),
        ("image/png", Media::Other("image/png".into())),
    ];

    for (header, expected) in cases {
        assert_eq!(classify(Some(header), ""), expected);
    }
}
/// Checks the media class chosen from the body alone when no Content-Type
/// header is supplied.
#[test]
fn test_classify_by_sniff_when_no_header() {
    // (body, expected classification)
    let cases = [
        (" <html><body>hi</body></html>", Media::Html),
        (" {\"a\": 1}", Media::Json),
        ("just words", Media::Text),
        ("{not json", Media::Text),
    ];

    for (body, expected) in cases {
        assert_eq!(classify(None, body), expected);
    }
}
/// Checks that a compact JSON body is reported as `json`, has no references,
/// and comes back with `"a": 1` spacing.
#[test]
fn test_json_passthrough_is_pretty_printed() {
    let opts = FetchOptions::default();
    let compact = "{\"a\":1,\"b\":[2,3]}";

    let result = convert_body(compact, "https://api.test/x", Some("application/json"), &opts);

    assert_eq!(result.media, "json");
    assert!(result.references.is_empty());
    assert!(
        result.content.contains("\"a\": 1"),
        "content: {}",
        result.content
    );
}
/// Checks that a `text/markdown` body is reported as `text` and returned
/// unchanged.
#[test]
fn test_text_passthrough_is_verbatim() {
    let opts = FetchOptions::default();
    let body = "# Title\n\nsome *markdown*";

    let result = convert_body(body, "https://x.test/readme.md", Some("text/markdown"), &opts);

    assert_eq!(result.media, "text");
    assert_eq!(result.content, "# Title\n\nsome *markdown*");
}
/// Checks that an `image/png` body keeps its media type and is replaced by
/// content containing "not rendered".
#[test]
fn test_binary_media_is_summarized_not_rendered() {
    let opts = FetchOptions::default();
    let payload = "\u{0089}PNGblob";

    let result = convert_body(payload, "https://x.test/a.png", Some("image/png"), &opts);

    assert_eq!(result.media, "image/png");
    assert!(
        result.content.contains("not rendered"),
        "content: {}",
        result.content
    );
}
/// Checks that `convert_body` with a `text/html` header reports `html`,
/// collects the three DOCS references, and emits inline `[1]` markers.
#[test]
fn test_html_path_still_extracts_refs_and_media() {
    let opts = FetchOptions::default();
    let url = "https://docs.example.com/page";

    let result = convert_body(DOCS, url, Some("text/html"), &opts);

    assert_eq!(result.media, "html");
    assert_eq!(result.references.len(), 3);
    assert!(result.content.contains("users endpoint [1]"));
}
/// Checks that `convert_html` surfaces the title, the `<meta>` description,
/// author, published time and site name, and the `<html lang>` attribute.
#[test]
fn test_metadata_extraction() {
    // The raw string is kept flush-left: its bytes are part of the test input.
    let html = r#"<!DOCTYPE html><html lang="en">
<head>
<title>Meta Test</title>
<meta name="description" content="A short summary.">
<meta name="author" content="Ada Lovelace">
<meta property="article:published_time" content="2024-12-01">
<meta property="og:site_name" content="Example Docs">
</head>
<body><article><p>Body.</p></article></body></html>"#;
    let opts = FetchOptions::default();

    let page = convert_html(html, "https://example.com/p", &opts);
    assert_eq!(page.title, "Meta Test");

    let meta = &page.metadata;
    assert_eq!(meta.description.as_deref(), Some("A short summary."));
    assert_eq!(meta.author.as_deref(), Some("Ada Lovelace"));
    assert_eq!(meta.published.as_deref(), Some("2024-12-01"));
    assert_eq!(meta.site_name.as_deref(), Some("Example Docs"));
    assert_eq!(meta.lang.as_deref(), Some("en"));
}