use log::warn;
use quick_xml::Reader;
use quick_xml::events::Event;
use crate::types::uri::raw::{RawUri, SpanProvider};
/// Extracts link candidates from an XML document.
///
/// Two shapes are recognised:
///
/// * the text content of `<loc>` and `<link>` elements (sitemaps, RSS), reported
///   with `attribute: None`;
/// * the `href` attribute of `<link>` elements (Atom), reported with
///   `attribute: Some("href")`. Both the self-closing form `<link href="..." />`
///   and the open/close form `<link href="..."></link>` are handled.
///
/// Every reported [`RawUri`] carries the span that `span_provider` returns for
/// the reader position right after the element's opening (or self-closing) tag.
/// Empty texts and empty attribute values are skipped.
///
/// Malformed XML does not panic: a warning is logged, parsing stops, and the
/// URIs collected up to that point are returned. A warning is also logged when
/// no URI was found at all.
pub(crate) fn extract_xml<S: SpanProvider>(input: &str, span_provider: &S) -> Vec<RawUri> {
    let mut reader = Reader::from_str(input);
    let mut uris: Vec<RawUri> = Vec::new();

    loop {
        // `Start` elements may carry text content, `Empty` (self-closing) ones cannot.
        let (tag, has_body) = match reader.read_event() {
            Ok(Event::Start(tag)) => (tag, true),
            Ok(Event::Empty(tag)) => (tag, false),
            Ok(Event::Eof) => break,
            Ok(_) => continue,
            Err(err) => {
                // The input is user-provided; a syntax error must not abort the
                // whole program. Keep what was extracted so far.
                warn!("Failed to parse XML input, stopping link extraction: {err}");
                break;
            }
        };

        // Only these two element names are of interest; taking the name from
        // the match avoids re-decoding the tag name from bytes.
        let element = match tag.name().as_ref() {
            b"loc" => "loc",
            b"link" => "link",
            _ => continue,
        };

        // Position right after the opening tag, i.e. where the text content
        // (if any) starts. Must be captured before `read_text` advances the reader.
        let offset: usize = reader.buffer_position().try_into().unwrap_or_default();

        // NOTE(review): attribute values and element text appear to be taken
        // verbatim from the document, so XML entities such as `&amp;` are
        // presumably not unescaped here -- confirm against the quick_xml docs.
        if element == "link" {
            for attr in tag.attributes().flatten() {
                if attr.key.as_ref() != b"href" {
                    continue;
                }
                let text = std::str::from_utf8(attr.value.as_ref())
                    .unwrap_or("")
                    .to_string();
                if !text.is_empty() {
                    uris.push(RawUri {
                        text,
                        element: Some(element.to_string()),
                        attribute: Some("href".to_string()),
                        span: span_provider.span(offset),
                    });
                }
            }
        }

        if has_body {
            // A failed read (e.g. missing closing tag) is treated as "no text".
            let text = reader
                .read_text(tag.name())
                .unwrap_or_default()
                .as_ref()
                .to_string();
            if !text.is_empty() {
                uris.push(RawUri {
                    text,
                    element: Some(element.to_string()),
                    attribute: None,
                    span: span_provider.span(offset),
                });
            }
        }
    }

    if uris.is_empty() {
        warn!(
            "No URLs found in XML input. Currently, lychee only supports extracting URLs from sitemaps, RSS and Atom feeds. If your XML contains links in a different format, please consider submitting a feature request or contributing support for additional XML formats."
        );
    }

    uris
}
// Unit tests for `extract_xml`, one per supported feed flavour (sitemap, RSS, Atom).
//
// NOTE(review): the expected `span(..)` values depend on the exact bytes of the
// raw-string fixtures below, including any leading whitespace on each fixture
// line. The arguments look like (line, column) -- confirm against the `span`
// helper -- so re-indenting a fixture would change the expected columns.
#[cfg(test)]
mod tests {
use crate::types::uri::raw::{SourceSpanProvider, span};
use super::*;
// Runs `extract_xml` on `input` with a `SourceSpanProvider` built from that same input.
fn extract(input: &str) -> Vec<RawUri> {
extract_xml(input, &SourceSpanProvider::from_input(input))
}
// Sitemap: the text of every `<loc>` element is expected, with element `loc`
// and no attribute; `<lastmod>` entries must not show up in the result.
#[test]
fn test_extract_sitemap_links() {
let input = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://elastisys.io/welkin/</loc>
<lastmod>2026-03-04</lastmod>
</url>
<url>
<loc>https://elastisys.io/welkin/architecture/</loc>
<lastmod>2026-03-04</lastmod>
</url>
<url>
<loc>https://elastisys.io/welkin/glossary/</loc>
<lastmod>2026-03-04</lastmod>
</url>
</urlset>"#;
let expected = vec![
RawUri {
text: "https://elastisys.io/welkin/".to_string(),
element: Some("loc".to_string()),
attribute: None,
span: span(4, 15),
},
RawUri {
text: "https://elastisys.io/welkin/architecture/".to_string(),
element: Some("loc".to_string()),
attribute: None,
span: span(8, 15),
},
RawUri {
text: "https://elastisys.io/welkin/glossary/".to_string(),
element: Some("loc".to_string()),
attribute: None,
span: span(12, 15),
},
];
let uris = extract(input);
assert_eq!(uris, expected);
}
// RSS: the text of the channel-level and item-level `<link>` elements is
// expected, with element `link` and no attribute; `<title>` and
// `<description>` must not show up in the result.
#[test]
fn test_extract_rss_links() {
let input = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Example Feed</title>
<link>https://example.com</link>
<description>Example RSS Feed</description>
<item>
<title>Example Item</title>
<link>https://example.com/item</link>
<description>Example Item Description</description>
</item>
</channel>
</rss>"#;
let expected = vec![
RawUri {
text: "https://example.com".to_string(),
element: Some("link".to_string()),
attribute: None,
span: span(5, 15),
},
RawUri {
text: "https://example.com/item".to_string(),
element: Some("link".to_string()),
attribute: None,
span: span(9, 19),
},
];
let uris = extract(input);
assert_eq!(uris, expected);
}
// Atom: the `href` attribute of the self-closing `<link ... />` elements is
// expected, with element `link` and attribute `href`; the `<id>` URNs and
// other elements must not show up in the result.
#[test]
fn test_extract_atom_links() {
let input = r#"<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="https://example.com" />
<updated>2026-03-04T12:00:00Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
<entry>
<title>Example Entry</title>
<link href="https://example.com/entry" />
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2026-03-04T12:00:00Z</updated>
<summary>Example Entry Summary</summary>
</entry>
</feed>"#;
let expected = vec![
RawUri {
text: "https://example.com".to_string(),
element: Some("link".to_string()),
attribute: Some("href".to_string()),
span: span(4, 40),
},
RawUri {
text: "https://example.com/entry".to_string(),
element: Some("link".to_string()),
attribute: Some("href".to_string()),
span: span(12, 50),
},
];
let uris = extract(input);
assert_eq!(uris, expected);
}
}