use super::extract::apply_extract;
/// HTML fixture shaped like a blog post.
///
/// The `<head>` carries a `<title>`, a description meta tag, two Open Graph
/// tags, a Twitter card tag, a canonical link and one JSON-LD script. The
/// `<body>` holds a `<nav>` with two anchors, an `<article>` (heading, a
/// paragraph with bold text and one anchor, and a two-item list) and a
/// `<footer>` with one `rel="nofollow"` anchor, for four anchors in total.
const BLOG_HTML: &str = r#"<html><head><title>Blog Post</title>
<meta name="description" content="A great blog post">
<meta property="og:title" content="OG Blog Title">
<meta property="og:image" content="https://example.com/og.jpg">
<meta name="twitter:card" content="summary_large_image">
<link rel="canonical" href="https://example.com/blog/post">
<script type="application/ld+json">{"@type":"Article","headline":"JSON-LD Title"}</script>
</head><body>
<nav><a href="/">Home</a><a href="/about">About</a></nav>
<article><h1>Blog Post Title</h1><p>This is the <strong>main content</strong> with a <a href="/related">related link</a>.</p>
<ul><li>Item 1</li><li>Item 2</li></ul></article>
<footer><a href="https://twitter.com/example" rel="nofollow">Twitter</a></footer>
</body></html>"#;
/// JSON fixture: a `data` object holding an `items` array of two
/// `{name, score}` records (Alpha/95 and Beta/42) plus a `total` of 2.
const JSON_API: &str =
r#"{"data":{"items":[{"name":"Alpha","score":95},{"name":"Beta","score":42}],"total":2}}"#;
/// RSS 2.0 fixture: one channel titled "Test Feed" containing a single item
/// with a title, a link and a publication date.
const RSS_FEED: &str = r#"<?xml version="1.0"?><rss version="2.0"><channel>
<title>Test Feed</title><item><title>Entry 1</title><link>https://example.com/1</link>
<pubDate>Mon, 20 Mar 2026 00:00:00 GMT</pubDate></item></channel></rss>"#;
#[cfg(feature = "fetch-markdown")]
mod markdown {
    //! Tests for the `markdown` extract mode, driven by the `BLOG_HTML` fixture.
    use super::*;

    /// Extract mode name handed to `apply_extract` by every test in this module.
    const MODE: &str = "markdown";

    /// The article `<h1>` must show up as a level-one Markdown heading.
    #[test]
    fn extract_markdown_produces_headings() {
        let result = apply_extract(BLOG_HTML, Some(MODE), None).unwrap();
        let has_heading = result.contains("# Blog Post Title");
        assert!(has_heading, "Expected '# Blog Post Title' in:\n{result}");
    }

    /// `<strong>` text must be wrapped in `**`.
    #[test]
    fn extract_markdown_preserves_bold() {
        let result = apply_extract(BLOG_HTML, Some(MODE), None).unwrap();
        let has_bold = result.contains("**main content**");
        assert!(has_bold, "Expected '**main content**' in:\n{result}");
    }

    /// Anchors must be emitted as inline Markdown links with their original href.
    #[test]
    fn extract_markdown_produces_links() {
        let result = apply_extract(BLOG_HTML, Some(MODE), None).unwrap();
        let has_link = result.contains("[related link](/related)");
        assert!(has_link, "Expected '[related link](/related)' in:\n{result}");
    }

    /// The article heading text must be present in the output.
    // NOTE(review): despite the name, this only checks the article heading;
    // nav/footer content is not asserted here.
    #[test]
    fn extract_markdown_includes_full_document() {
        let rendered = apply_extract(BLOG_HTML, Some(MODE), None).unwrap();
        assert!(
            rendered.contains("Blog Post Title"),
            "Expected article heading in markdown output"
        );
    }

    /// Both list items must survive the conversion.
    #[test]
    fn extract_markdown_preserves_list_items() {
        let rendered = apply_extract(BLOG_HTML, Some(MODE), None).unwrap();
        let expectations = [
            ("Item 1", "Expected 'Item 1' in markdown output"),
            ("Item 2", "Expected 'Item 2' in markdown output"),
        ];
        for (item, message) in expectations {
            assert!(rendered.contains(item), "{message}");
        }
    }

    /// An empty body must produce output that is empty after trimming.
    #[test]
    fn extract_markdown_empty_html() {
        let rendered = apply_extract("", Some(MODE), None).unwrap();
        let is_blank = rendered.trim().is_empty();
        assert!(is_blank, "Expected empty markdown for empty input");
    }
}
#[cfg(feature = "fetch-html")]
mod text {
    //! Tests for the `text` extract mode, with and without a CSS selector.
    use super::*;

    /// Extract mode name handed to `apply_extract` by every test in this module.
    const MODE: &str = "text";

    /// Without a selector, the heading and paragraph text must both be present.
    #[test]
    fn extract_text_all() {
        let result = apply_extract(BLOG_HTML, Some(MODE), None).unwrap();
        let has_title = result.contains("Blog Post Title");
        assert!(has_title, "Expected 'Blog Post Title' in:\n{result}");
        let has_body = result.contains("main content");
        assert!(has_body, "Expected 'main content' in:\n{result}");
    }

    /// Selecting `article h1` must yield the heading text.
    #[test]
    fn extract_text_with_selector_h1() {
        let result = apply_extract(BLOG_HTML, Some(MODE), Some("article h1")).unwrap();
        let has_title = result.contains("Blog Post Title");
        assert!(has_title, "Expected 'Blog Post Title' in:\n{result}");
    }

    /// Selecting `article p` must yield the paragraph text.
    #[test]
    fn extract_text_with_selector_p() {
        let result = apply_extract(BLOG_HTML, Some(MODE), Some("article p")).unwrap();
        let has_body = result.contains("main content");
        assert!(has_body, "Expected 'main content' in:\n{result}");
    }

    /// Text output must not contain raw markup.
    #[test]
    fn extract_text_no_html_tags() {
        let plain = apply_extract(BLOG_HTML, Some(MODE), None).unwrap();
        let forbidden = [
            ("<h1>", "Should not contain raw <h1> tags"),
            ("<p>", "Should not contain raw <p> tags"),
            ("</article>", "Should not contain closing tags"),
        ];
        for (tag, message) in forbidden {
            assert!(!plain.contains(tag), "{message}");
        }
    }

    /// A selector must restrict the output to the matched element only.
    #[test]
    fn extract_text_selector_returns_only_matched() {
        let page =
            r#"<html><body><div class="a">Alpha</div><div class="b">Beta</div></body></html>"#;
        let matched = apply_extract(page, Some(MODE), Some("div.a")).unwrap();
        assert!(matched.contains("Alpha"));
        assert!(!matched.contains("Beta"));
    }

    /// An empty body must produce output that is empty after trimming.
    #[test]
    fn extract_text_empty_html() {
        let result = apply_extract("", Some(MODE), None).unwrap();
        let is_blank = result.trim().is_empty();
        assert!(
            is_blank,
            "Expected empty text for empty input, got: '{result}'"
        );
    }

    /// A malformed CSS selector must be reported as an error, not a panic.
    #[test]
    fn extract_text_invalid_selector_returns_error() {
        let outcome = apply_extract(BLOG_HTML, Some(MODE), Some("[[[invalid"));
        assert!(outcome.is_err(), "Invalid selector should produce an error");
    }
}
#[cfg(feature = "fetch-html")]
mod metadata {
    //! Tests for the `metadata` extract mode, whose output is parsed as JSON.
    use super::*;

    /// Extract mode name handed to `apply_extract` by every test in this module.
    const MODE: &str = "metadata";

    /// Runs the `metadata` extraction on `html` and parses the output as JSON.
    ///
    /// Panics if extraction fails or the output is not valid JSON.
    fn parse_metadata(html: &str) -> serde_json::Value {
        let raw = apply_extract(html, Some(MODE), None).unwrap();
        serde_json::from_str(&raw).expect("metadata should be valid JSON")
    }

    /// `title` comes from the `<title>` element.
    #[test]
    fn extract_metadata_title() {
        assert_eq!(parse_metadata(BLOG_HTML)["title"], "Blog Post");
    }

    /// `description` comes from the description meta tag.
    #[test]
    fn extract_metadata_description() {
        assert_eq!(
            parse_metadata(BLOG_HTML)["description"],
            "A great blog post"
        );
    }

    /// Open Graph tags are grouped under `og` with the `og:` prefix removed.
    #[test]
    fn extract_metadata_og() {
        let meta = parse_metadata(BLOG_HTML);
        let og = &meta["og"];
        assert_eq!(og["title"], "OG Blog Title");
        assert_eq!(og["image"], "https://example.com/og.jpg");
    }

    /// Twitter tags are grouped under `twitter` with the `twitter:` prefix removed.
    #[test]
    fn extract_metadata_twitter() {
        assert_eq!(
            parse_metadata(BLOG_HTML)["twitter"]["card"],
            "summary_large_image"
        );
    }

    /// JSON-LD scripts are parsed and exposed as a non-empty `json_ld` array.
    #[test]
    fn extract_metadata_jsonld() {
        let meta = parse_metadata(BLOG_HTML);
        let entries = meta["json_ld"]
            .as_array()
            .expect("json_ld should be an array");
        assert!(
            !entries.is_empty(),
            "json_ld should have at least one entry"
        );
        let first = &entries[0];
        assert_eq!(first["@type"], "Article");
        assert_eq!(first["headline"], "JSON-LD Title");
    }

    /// `canonical` comes from the canonical `<link>` href.
    #[test]
    fn extract_metadata_canonical() {
        assert_eq!(
            parse_metadata(BLOG_HTML)["canonical"],
            "https://example.com/blog/post"
        );
    }

    /// With an empty `<head>`, `title`, `og` and `twitter` are absent or null.
    #[test]
    fn extract_metadata_missing_fields() {
        let meta = parse_metadata("<html><head></head><body></body></html>");
        // Indexing a `serde_json::Value` with a missing key yields `Null`, so a
        // single `is_null` check covers both the "absent" and "null" cases.
        for key in ["title", "og", "twitter"] {
            assert!(meta[key].is_null());
        }
    }

    /// The raw output must parse as JSON.
    #[test]
    fn extract_metadata_returns_valid_json() {
        let raw = apply_extract(BLOG_HTML, Some(MODE), None).unwrap();
        let parses = serde_json::from_str::<serde_json::Value>(&raw).is_ok();
        assert!(parses, "metadata output must be valid JSON");
    }
}
#[cfg(feature = "fetch-html")]
mod links {
    //! Tests for the `links` extract mode, whose output is parsed as JSON with
    //! a `count` field and a `links` array of `{url, anchor, rel}` objects.
    use super::*;

    /// Runs the `links` extraction on `html` and parses the output as JSON.
    ///
    /// Panics if extraction fails or the output is not valid JSON.
    fn parse_links(html: &str) -> serde_json::Value {
        let raw = apply_extract(html, Some("links"), None).unwrap();
        serde_json::from_str(&raw).expect("links should be valid JSON")
    }

    /// Collects the `url` of every entry in `parsed["links"]`.
    ///
    /// Panics if `links` is not an array or any `url` is not a string.
    fn link_urls(parsed: &serde_json::Value) -> Vec<&str> {
        parsed["links"]
            .as_array()
            .unwrap()
            .iter()
            .map(|link| link["url"].as_str().unwrap())
            .collect()
    }

    /// Returns the first entry in `parsed["links"]` whose `url` equals `url`.
    ///
    /// Panics if `links` is not an array or no entry matches.
    fn link_by_url<'a>(parsed: &'a serde_json::Value, url: &str) -> &'a serde_json::Value {
        parsed["links"]
            .as_array()
            .unwrap()
            .iter()
            .find(|link| link["url"] == url)
            .unwrap()
    }

    /// The blog fixture contains four anchors in total.
    #[test]
    fn extract_links_count() {
        assert_eq!(parse_links(BLOG_HTML)["count"], 4);
    }

    /// Relative hrefs are reported unchanged.
    #[test]
    fn extract_links_internal_urls() {
        let parsed = parse_links(BLOG_HTML);
        let urls = link_urls(&parsed);
        assert!(urls.contains(&"/"), "Expected '/' in links");
        assert!(urls.contains(&"/about"), "Expected '/about' in links");
        assert!(urls.contains(&"/related"), "Expected '/related' in links");
    }

    /// The absolute footer link is reported too.
    #[test]
    fn extract_links_external_urls() {
        let parsed = parse_links(BLOG_HTML);
        let has_twitter = link_urls(&parsed)
            .iter()
            .any(|url| url.contains("twitter.com"));
        assert!(has_twitter, "Expected external twitter.com link");
    }

    /// The `/` link carries its anchor text.
    #[test]
    fn extract_links_anchor_text() {
        let parsed = parse_links(BLOG_HTML);
        assert_eq!(link_by_url(&parsed, "/")["anchor"], "Home");
    }

    /// The `rel` attribute of the footer link is surfaced.
    #[test]
    fn extract_links_nofollow() {
        let parsed = parse_links(BLOG_HTML);
        let twitter = parsed["links"]
            .as_array()
            .unwrap()
            .iter()
            .find(|link| link["url"].as_str().unwrap_or("").contains("twitter.com"))
            .expect("twitter link should exist");
        assert_eq!(twitter["rel"], "nofollow");
    }

    /// The `/about` link carries its anchor text.
    #[test]
    fn extract_links_about_anchor() {
        let parsed = parse_links(BLOG_HTML);
        assert_eq!(link_by_url(&parsed, "/about")["anchor"], "About");
    }

    /// The in-article link carries its anchor text.
    #[test]
    fn extract_links_related_anchor() {
        let parsed = parse_links(BLOG_HTML);
        assert_eq!(link_by_url(&parsed, "/related")["anchor"], "related link");
    }

    /// A document without anchors yields a zero count and an empty array.
    #[test]
    fn extract_links_empty_html() {
        let parsed = parse_links("<html><body></body></html>");
        assert_eq!(parsed["count"], 0);
        let entries = parsed["links"].as_array().unwrap();
        assert!(entries.is_empty());
    }
}
mod jsonpath {
    //! Tests for the `jsonpath` extract mode, driven by the `JSON_API` fixture.
    use super::*;

    /// Extract mode name handed to `apply_extract` by every test in this module.
    const MODE: &str = "jsonpath";

    /// A path to a scalar number returns that number as JSON text.
    #[test]
    fn extract_jsonpath_simple() {
        let total = apply_extract(JSON_API, Some(MODE), Some("$.data.total")).unwrap();
        assert_eq!(total, "2");
    }

    /// A wildcard path returns a JSON array of every match.
    #[test]
    fn extract_jsonpath_array() {
        let raw = apply_extract(JSON_API, Some(MODE), Some("$.data.items[*].name")).unwrap();
        let names = serde_json::from_str::<Vec<String>>(&raw).unwrap();
        assert_eq!(names, vec!["Alpha", "Beta"]);
    }

    /// An indexed path reaches into a specific array element.
    #[test]
    fn extract_jsonpath_nested() {
        let score = apply_extract(JSON_API, Some(MODE), Some("$.data.items[0].score")).unwrap();
        assert_eq!(score, "95");
    }

    /// A single string match is returned JSON-encoded, quotes included.
    #[test]
    fn extract_jsonpath_second_item() {
        let name = apply_extract(JSON_API, Some(MODE), Some("$.data.items[1].name")).unwrap();
        assert_eq!(name, "\"Beta\"");
    }

    /// A path that matches nothing returns the JSON literal `null`.
    #[test]
    fn extract_jsonpath_no_match() {
        let missing = apply_extract(JSON_API, Some(MODE), Some("$.data.nonexistent")).unwrap();
        assert_eq!(missing, "null");
    }

    /// A malformed expression is rejected with an "Invalid JSONPath" error.
    #[test]
    fn extract_jsonpath_invalid_expression() {
        let outcome = apply_extract(JSON_API, Some(MODE), Some("$[invalid"));
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Invalid JSONPath"));
    }

    /// A non-JSON body is rejected with a "not valid JSON" error.
    #[test]
    fn extract_jsonpath_invalid_json_body() {
        let outcome = apply_extract("not json", Some(MODE), Some("$.foo"));
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("not valid JSON"));
    }

    /// Omitting the selector is an error that names the missing field.
    #[test]
    fn extract_jsonpath_requires_selector() {
        let outcome = apply_extract(JSON_API, Some(MODE), None);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("jsonpath requires 'selector'"));
    }

    /// A wildcard path over numbers returns them in document order.
    #[test]
    fn extract_jsonpath_all_scores() {
        let raw = apply_extract(JSON_API, Some(MODE), Some("$.data.items[*].score")).unwrap();
        let scores = serde_json::from_str::<Vec<u64>>(&raw).unwrap();
        assert_eq!(scores, vec![95, 42]);
    }
}
#[cfg(feature = "fetch-feed")]
mod feed {
    //! Tests for the `feed` extract mode, driven by the `RSS_FEED` fixture.
    use super::*;

    /// Runs the `feed` extraction on `RSS_FEED` and parses the output as JSON.
    ///
    /// Panics if extraction fails or the output is not valid JSON.
    fn parse_feed() -> serde_json::Value {
        let raw = apply_extract(RSS_FEED, Some("feed"), None).unwrap();
        serde_json::from_str(&raw).unwrap()
    }

    /// The channel title is exposed and `entries` is an array.
    #[test]
    fn extract_feed_rss() {
        let feed = parse_feed();
        assert_eq!(feed["title"], "Test Feed");
        assert!(feed["entries"].is_array());
    }

    /// The first entry carries the item title.
    #[test]
    fn extract_feed_entry_title() {
        let feed = parse_feed();
        let entries = feed["entries"].as_array().unwrap();
        assert!(!entries.is_empty(), "Feed should have at least one entry");
        let first = &entries[0];
        assert_eq!(first["title"], "Entry 1");
    }

    /// The first entry exposes the item link as `url`.
    #[test]
    fn extract_feed_entry_url() {
        let feed = parse_feed();
        let entries = feed["entries"].as_array().unwrap();
        assert_eq!(entries[0]["url"], "https://example.com/1");
    }

    /// `entry_count` reflects the single item in the fixture.
    #[test]
    fn extract_feed_entry_count() {
        assert_eq!(parse_feed()["entry_count"], 1);
    }
}
#[cfg(feature = "fetch-article")]
mod article {
    //! Tests for the `article` extract mode, whose output is parsed as JSON
    //! with `title`, `content` and `text_content` fields.
    use super::*;

    /// HTML fixture with a `<nav>`, a two-paragraph `<article>` and a `<footer>`.
    ///
    /// The paragraphs are deliberately long; per the fixture's own text this is
    /// meant to give the extraction enough content to identify the article.
    const ARTICLE_HTML: &str = r#"<html><head><title>Big Article</title></head><body>
<nav><a href="/">Home</a><a href="/about">About</a></nav>
<article>
<h1>Big Article Title</h1>
<p>This is the main body of the article. It needs to be long enough
for the readability algorithm to consider it as content. The algorithm
typically requires a minimum amount of text content to identify an
article region. So we add several sentences here to make sure the
extraction works properly. This is important for testing purposes.
We continue adding more content so the readability heuristics work.</p>
<p>Second paragraph with more content to help the readability score.
The more text we have here, the better the extraction will work.
Adding even more text for the readability parser. This ensures that
the algorithm correctly identifies the main content block.</p>
</article>
<footer>Footer content</footer>
</body></html>"#;

    /// Runs the `article` extraction on `ARTICLE_HTML` and parses the output as JSON.
    ///
    /// Panics if extraction fails or the output is not valid JSON.
    fn extract_article() -> serde_json::Value {
        let raw = apply_extract(ARTICLE_HTML, Some("article"), None).unwrap();
        serde_json::from_str(&raw).unwrap()
    }

    /// Returns the extracted `text_content` as an owned string.
    ///
    /// Panics when `text_content` is missing or not a string. Falling back to
    /// an empty string would let negative assertions (such as "does not
    /// contain nav text") pass vacuously on broken output.
    fn article_text() -> String {
        let parsed = extract_article();
        let text = parsed["text_content"]
            .as_str()
            .expect("text_content should be a string");
        text.to_owned()
    }

    /// The extracted text must include the article body.
    #[test]
    fn extract_article_content() {
        let text = article_text();
        assert!(
            text.contains("main body"),
            "Article text should contain article content"
        );
    }

    /// The extracted text must not include the navigation link text.
    #[test]
    fn extract_article_strips_nav() {
        let text = article_text();
        assert!(
            !text.contains("Home"),
            "Article text should not contain nav links"
        );
    }

    /// The output must expose the `title`, `content` and `text_content` fields.
    #[test]
    fn extract_article_returns_structured_json() {
        let parsed = extract_article();
        assert!(parsed.get("title").is_some(), "Should have title field");
        assert!(parsed.get("content").is_some(), "Should have content field");
        assert!(
            parsed.get("text_content").is_some(),
            "Should have text_content field"
        );
    }
}
#[cfg(feature = "fetch-html")]
mod selector {
    //! Tests for the `selector` extract mode, which returns the HTML of the
    //! elements matched by a CSS selector.
    use super::*;

    /// Extract mode name handed to `apply_extract` by every test in this module.
    const MODE: &str = "selector";

    /// Selecting the list returns its markup, including the `<li>` children.
    #[test]
    fn extract_selector_html() {
        let fragment = apply_extract(BLOG_HTML, Some(MODE), Some("article ul")).unwrap();
        let expectations = [
            ("<li>", "Selector should return raw HTML with <li>"),
            ("Item 1", "Selector should return content"),
        ];
        for (needle, message) in expectations {
            assert!(fragment.contains(needle), "{message}");
        }
    }

    /// A selector matching several elements returns all of them.
    #[test]
    fn extract_selector_multiple_matches() {
        let fragment = apply_extract(BLOG_HTML, Some(MODE), Some("li")).unwrap();
        assert!(fragment.contains("Item 1"));
        assert!(fragment.contains("Item 2"));
    }

    /// The matched element's own tag is part of the output, not just its children.
    #[test]
    fn extract_selector_returns_outer_html() {
        let fragment = apply_extract(BLOG_HTML, Some(MODE), Some("article h1")).unwrap();
        let has_tag = fragment.contains("<h1>");
        assert!(
            has_tag,
            "Selector should include the matched element's outer HTML"
        );
        assert!(fragment.contains("Blog Post Title"));
    }

    /// Omitting the selector is an error that names the missing field.
    #[test]
    fn extract_selector_requires_selector_field() {
        let outcome = apply_extract(BLOG_HTML, Some(MODE), None);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("requires 'selector' field"));
    }

    /// A selector that matches nothing yields an empty string rather than an error.
    #[test]
    fn extract_selector_no_match_returns_empty() {
        let fragment = apply_extract(BLOG_HTML, Some(MODE), Some("div.nonexistent")).unwrap();
        assert!(fragment.is_empty(), "No match should return empty string");
    }
}
mod edge_cases {
    //! Tests for `apply_extract` with no extract mode and with an unknown mode.
    use super::*;

    /// With no mode, the HTML body is returned unchanged.
    #[test]
    fn extract_none_returns_raw() {
        let passthrough = apply_extract(BLOG_HTML, None, None).unwrap();
        assert_eq!(
            passthrough, BLOG_HTML,
            "No extract should return original body"
        );
    }

    /// With no mode, an empty body stays empty.
    #[test]
    fn extract_empty_html_none() {
        let passthrough = apply_extract("", None, None).unwrap();
        assert_eq!(
            passthrough, "",
            "Empty input with no extract returns empty"
        );
    }

    /// An unknown mode is an error whose message names the offending mode.
    #[test]
    fn extract_invalid_mode() {
        let outcome = apply_extract(BLOG_HTML, Some("bogus_mode"), None);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        let expectations = [
            ("Unknown extract mode", "Error should mention unknown mode"),
            ("bogus_mode", "Error should include the invalid mode name"),
        ];
        for (fragment, why) in expectations {
            assert!(message.contains(fragment), "{why}");
        }
    }

    /// With no mode, a JSON body is returned unchanged.
    #[test]
    fn extract_none_preserves_json() {
        let passthrough = apply_extract(JSON_API, None, None).unwrap();
        assert_eq!(passthrough, JSON_API, "JSON should be preserved verbatim");
    }

    /// With no mode, an XML body is returned unchanged.
    #[test]
    fn extract_none_preserves_xml() {
        let passthrough = apply_extract(RSS_FEED, None, None).unwrap();
        assert_eq!(passthrough, RSS_FEED, "XML should be preserved verbatim");
    }
}