use quick_xml::Reader;
use quick_xml::events::Event;
use url::Url;
use crate::http::http_fetch;
use crate::normalize::{resolve_redirect, rewrite_url_host};
use crate::types::{CrawlConfig, SitemapUrl};
pub(crate) fn parse_sitemap_xml(body: &str) -> Vec<SitemapUrl> {
let mut urls = Vec::new();
let mut reader = Reader::from_str(body);
let mut buf = Vec::new();
let mut in_url = false;
let mut in_loc = false;
let mut in_lastmod = false;
let mut in_changefreq = false;
let mut in_priority = false;
let mut current_loc = String::new();
let mut current_lastmod: Option<String> = None;
let mut current_changefreq: Option<String> = None;
let mut current_priority: Option<String> = None;
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => match e.name().as_ref() {
b"url" => {
in_url = true;
current_loc.clear();
current_lastmod = None;
current_changefreq = None;
current_priority = None;
}
b"loc" if in_url => in_loc = true,
b"lastmod" if in_url => in_lastmod = true,
b"changefreq" if in_url => in_changefreq = true,
b"priority" if in_url => in_priority = true,
_ => {}
},
Ok(Event::End(ref e)) => match e.name().as_ref() {
b"url" => {
if in_url && !current_loc.is_empty() {
urls.push(SitemapUrl {
url: current_loc.clone(),
lastmod: current_lastmod.clone(),
changefreq: current_changefreq.clone(),
priority: current_priority.clone(),
});
}
in_url = false;
}
b"loc" => in_loc = false,
b"lastmod" => in_lastmod = false,
b"changefreq" => in_changefreq = false,
b"priority" => in_priority = false,
_ => {}
},
Ok(Event::Text(ref e)) => {
if let Ok(text) = e.xml_content() {
let text = text.trim().to_owned();
if in_loc {
current_loc = text;
} else if in_lastmod {
current_lastmod = Some(text);
} else if in_changefreq {
current_changefreq = Some(text);
} else if in_priority {
current_priority = Some(text);
}
}
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
urls
}
pub(crate) fn parse_sitemap_index(body: &str) -> Vec<String> {
let mut child_urls = Vec::new();
let mut reader = Reader::from_str(body);
let mut buf = Vec::new();
let mut in_sitemap = false;
let mut in_loc = false;
let mut current_loc = String::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => match e.name().as_ref() {
b"sitemap" => {
in_sitemap = true;
current_loc.clear();
}
b"loc" if in_sitemap => in_loc = true,
_ => {}
},
Ok(Event::End(ref e)) => match e.name().as_ref() {
b"sitemap" => {
if in_sitemap && !current_loc.is_empty() {
child_urls.push(current_loc.clone());
}
in_sitemap = false;
}
b"loc" => in_loc = false,
_ => {}
},
Ok(Event::Text(ref e)) => {
if in_loc && let Ok(text) = e.xml_content() {
current_loc = text.trim().to_owned();
}
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
child_urls
}
pub(crate) fn is_sitemap_index(body: &str) -> bool {
body.contains("<sitemapindex") || body.contains("<sitemapindex>")
}
pub(crate) async fn fetch_sitemap_tree(
sitemap_url: &str,
config: &CrawlConfig,
client: &reqwest::Client,
) -> Vec<SitemapUrl> {
let resp = match http_fetch(sitemap_url, config, &std::collections::HashMap::new(), client).await {
Ok(r) => r,
Err(_) => return Vec::new(),
};
process_sitemap_response(
sitemap_url,
&resp.body,
&resp.body_bytes,
&resp.content_type,
config,
client,
)
.await
}
pub(crate) async fn process_sitemap_response(
sitemap_url: &str,
body: &str,
body_bytes: &[u8],
content_type: &str,
config: &CrawlConfig,
client: &reqwest::Client,
) -> Vec<SitemapUrl> {
let decompressed;
let xml_body = if content_type.contains("gzip") || content_type.contains("x-gzip") {
match decompress_gzip(body_bytes) {
Ok(d) => {
decompressed = d;
&decompressed
}
Err(_) => body,
}
} else {
body
};
if is_sitemap_index(xml_body) {
let child_urls = parse_sitemap_index(xml_body);
let base = Url::parse(sitemap_url).ok();
let mut all_urls = Vec::new();
let max_children = 100;
for child_url in child_urls.iter().take(max_children) {
let resolved = if let Some(ref base_parsed) = base {
if Url::parse(child_url).is_ok() {
rewrite_url_host(child_url, base_parsed)
} else {
resolve_redirect(sitemap_url, child_url)
}
} else {
child_url.clone()
};
let child_resp = match http_fetch(&resolved, config, &std::collections::HashMap::new(), client).await {
Ok(r) => r,
Err(_) => continue,
};
let child_body = &child_resp.body;
let child_decompressed;
let child_xml = if child_resp.content_type.contains("gzip") {
match decompress_gzip(&child_resp.body_bytes) {
Ok(d) => {
child_decompressed = d;
&child_decompressed
}
Err(_) => child_body,
}
} else {
child_body
};
all_urls.extend(parse_sitemap_xml(child_xml));
}
all_urls
} else {
parse_sitemap_xml(xml_body)
}
}
pub(crate) fn decompress_gzip(data: &[u8]) -> Result<String, std::io::Error> {
use flate2::read::GzDecoder;
use std::io::Read;
const MAX_DECOMPRESSED_SIZE: u64 = 50 * 1024 * 1024;
let decoder = GzDecoder::new(data);
let mut limited = decoder.take(MAX_DECOMPRESSED_SIZE);
let mut result = String::new();
limited.read_to_string(&mut result)?;
Ok(result)
}