use crate::brain::tools::web_scrape::sitemap::{extract_locs, looks_like_sitemap};
use url::Url;
/// A `<urlset>` holding three `<url>` entries must produce three locations,
/// including the first and the last one listed.
#[test]
fn extracts_all_page_locs() {
    let document = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/a</loc></url>
<url><loc>https://example.com/b</loc></url>
<url><loc>https://example.com/c</loc></url>
</urlset>"#;

    let found = extract_locs(document, None);

    // Count first, then spot-check both ends of the list.
    assert_eq!(found.len(), 3);
    assert!(found.iter().any(|loc| loc == "https://example.com/a"));
    assert!(found.iter().any(|loc| loc == "https://example.com/c"));
}
/// An XML-escaped ampersand (`&amp;`) inside a `<loc>` must come back as a
/// literal `&` in the extracted URL.
#[test]
fn decodes_amp_entities_in_query_strings() {
    // The fixture has to carry the *escaped* form: with a raw `&` in the input
    // this test would pass even if no entity decoding happened at all, and the
    // document would not be well-formed XML either.
    let xml = r#"<urlset><url><loc>https://example.com/p?a=1&amp;b=2</loc></url></urlset>"#;
    let locs = extract_locs(xml, None);
    // The expected value is the decoded URL, with a plain `&` separator.
    assert_eq!(locs, vec!["https://example.com/p?a=1&b=2".to_string()]);
}
/// A root-relative `<loc>` is joined onto the supplied base URL, yielding an
/// absolute location on the same host.
#[test]
fn resolves_relative_locs_against_base() {
    let sitemap_url = Url::parse("https://example.com/sitemap.xml").unwrap();

    let resolved = extract_locs(
        r#"<urlset><url><loc>/blog/post-1</loc></url></urlset>"#,
        Some(&sitemap_url),
    );

    let expected = vec![String::from("https://example.com/blog/post-1")];
    assert_eq!(resolved, expected);
}
/// With no base URL to resolve against, a root-relative `<loc>` yields an
/// empty result.
#[test]
fn drops_relative_locs_without_a_base() {
    let relative_only = r#"<urlset><url><loc>/blog/post-1</loc></url></urlset>"#;
    assert!(extract_locs(relative_only, None).is_empty());
}
/// A `<sitemapindex>` document lists child sitemaps; each child's `<loc>` is
/// extracted just like a page location.
#[test]
fn extracts_nested_sitemap_locs_from_index() {
    let index_document = r#"<sitemapindex>
<sitemap><loc>https://example.com/sitemap-posts.xml</loc></sitemap>
<sitemap><loc>https://example.com/sitemap-pages.xml</loc></sitemap>
</sitemapindex>"#;

    let children = extract_locs(index_document, None);

    assert_eq!(children.len(), 2);
    assert!(children
        .iter()
        .any(|loc| loc == "https://example.com/sitemap-posts.xml"));
}
/// `looks_like_sitemap` accepts both a `<urlset>` opening tag and a
/// `<sitemapindex>` opening tag, and rejects an ordinary HTML page.
#[test]
fn recognizes_urlset_and_index_documents() {
    let urlset_open = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">"#;
    let index_open = "<sitemapindex>";
    let html_page = "<html><body>not a sitemap</body></html>";

    // Positive cases: both sitemap root elements.
    assert!(looks_like_sitemap(urlset_open));
    assert!(looks_like_sitemap(index_open));

    // Negative case: plain HTML.
    assert!(!looks_like_sitemap(html_page));
}