lychee-lib 0.24.1

A fast, async link checker
Documentation
//! Extract links from XML documents. Currently supports sitemaps, RSS and Atom feeds.
use log::warn;
use quick_xml::Reader;
use quick_xml::events::Event;

use crate::types::uri::raw::{RawUri, SpanProvider};

/// Extract unparsed URL strings from common XML formats, like sitemap.xml, RSS feeds, or Atom feeds.
///
/// The following constructs are recognized:
/// - the text content of `<loc>` (sitemap) and `<link>` (RSS) elements
/// - the `href` attribute of self-closing `<link ... />` elements (Atom)
///
/// Every returned [`RawUri`] carries the element name, the attribute name (only for
/// `href` links) and a span obtained from `span_provider` for the reader's byte offset.
///
/// Malformed XML never causes a panic: parsing stops at the first reader error, a
/// warning is logged, and the URIs collected up to that point are returned.
///
/// NOTE(review): values are pushed exactly as handed out by `quick_xml`; presumably XML
/// entities such as `&amp;` are not unescaped here. Confirm against the `quick_xml` docs.
pub(crate) fn extract_xml<S: SpanProvider>(input: &str, span_provider: &S) -> Vec<RawUri> {
    let mut reader = Reader::from_str(input);

    let mut uris: Vec<RawUri> = Vec::new();

    loop {
        // The input is user-supplied, so a parse error is an expected condition and
        // must not bring the whole link checker down.
        let event = match reader.read_event() {
            Ok(event) => event,
            Err(err) => {
                warn!("Failed to parse XML input, stopping link extraction: {err}");
                break;
            }
        };

        match event {
            // `<loc>` is used by sitemaps, `<link>` (with text content) by RSS.
            Event::Start(e) if matches!(e.name().as_ref(), b"loc" | b"link") => {
                // Right after the start tag the reader sits on the first byte of the text.
                let start_of_text_offset: usize =
                    reader.buffer_position().try_into().unwrap_or_default();
                let element = std::str::from_utf8(e.name().as_ref())
                    .unwrap_or_default()
                    .to_string();
                // A failed read (e.g. missing end tag) yields an empty text, which is
                // skipped below; the next `read_event` call then reports the problem.
                let text = reader
                    .read_text(e.name())
                    .unwrap_or_default()
                    .as_ref()
                    .to_string();
                let span = span_provider.span(start_of_text_offset);

                if !text.is_empty() && !element.is_empty() {
                    uris.push(RawUri {
                        text,
                        element: Some(element),
                        attribute: None,
                        span,
                    });
                }
            }
            // Atom feeds use self-closing `<link href="..." />` elements.
            Event::Empty(e) if e.name().as_ref() == b"link" => {
                // `flatten` drops attributes that failed to parse.
                let hrefs = e
                    .attributes()
                    .flatten()
                    .filter(|attr| attr.key.as_ref() == b"href");

                for attr in hrefs {
                    let text = std::str::from_utf8(attr.value.as_ref())
                        .unwrap_or_default()
                        .to_string();
                    let element = std::str::from_utf8(e.name().as_ref())
                        .unwrap_or_default()
                        .to_string();
                    let end_of_empty_tag: usize =
                        reader.buffer_position().try_into().unwrap_or_default();
                    // Span is a bit imprecise, as it points to the end of the element.
                    // However, quick_xml does not provide the position of attributes,
                    // so this is the best we can do.
                    let span = span_provider.span(end_of_empty_tag);

                    if !text.is_empty() && !element.is_empty() {
                        uris.push(RawUri {
                            text,
                            element: Some(element),
                            attribute: Some("href".to_string()),
                            span,
                        });
                    }
                }
            }
            Event::Eof => break,
            _ => {}
        }
    }

    if uris.is_empty() {
        warn!(
            "No URLs found in XML input. Currently, lychee only supports extracting URLs from sitemaps, RSS and Atom feeds. If your XML contains links in a different format, please consider submitting a feature request or contributing support for additional XML formats."
        );
    }

    uris
}

#[cfg(test)]
mod tests {
    use crate::types::uri::raw::{SourceSpanProvider, span};

    use super::*;

    /// Run the XML extractor over `input`, resolving spans against that same input.
    fn extract(input: &str) -> Vec<RawUri> {
        let provider = SourceSpanProvider::from_input(input);
        extract_xml(input, &provider)
    }

    /// Sitemaps expose their URLs as the text content of `<loc>` elements.
    #[test]
    fn test_extract_sitemap_links() {
        // Sitemap example generated by mkdocs
        let input = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
         <loc>https://elastisys.io/welkin/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
    <url>
         <loc>https://elastisys.io/welkin/architecture/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
    <url>
         <loc>https://elastisys.io/welkin/glossary/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
</urlset>"#;

        // (url text, position of the first character of that text)
        let expected: Vec<RawUri> = vec![
            ("https://elastisys.io/welkin/", span(4, 15)),
            ("https://elastisys.io/welkin/architecture/", span(8, 15)),
            ("https://elastisys.io/welkin/glossary/", span(12, 15)),
        ]
        .into_iter()
        .map(|(url, location)| RawUri {
            text: url.to_string(),
            element: Some("loc".to_string()),
            attribute: None,
            span: location,
        })
        .collect();

        assert_eq!(extract(input), expected);
    }

    /// RSS feeds expose their URLs as the text content of `<link>` elements.
    #[test]
    fn test_extract_rss_links() {
        // RSS example
        let input = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
    <channel>
        <title>Example Feed</title>
        <link>https://example.com</link>
        <description>Example RSS Feed</description>
        <item>
            <title>Example Item</title>
            <link>https://example.com/item</link>
            <description>Example Item Description</description>
        </item>
    </channel>
</rss>"#;

        // (url text, position of the first character of that text)
        let expected: Vec<RawUri> = vec![
            ("https://example.com", span(5, 15)),
            ("https://example.com/item", span(9, 19)),
        ]
        .into_iter()
        .map(|(url, location)| RawUri {
            text: url.to_string(),
            element: Some("link".to_string()),
            attribute: None,
            span: location,
        })
        .collect();

        assert_eq!(extract(input), expected);
    }

    /// Atom feeds expose their URLs through the `href` attribute of empty `<link />` elements.
    #[test]
    fn test_extract_atom_links() {
        // Atom example
        let input = r#"<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <title>Example Feed</title>
    <link href="https://example.com" />
    <updated>2026-03-04T12:00:00Z</updated>
    <author>
        <name>John Doe</name>
    </author>
    <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
    <entry>
        <title>Example Entry</title>
        <link href="https://example.com/entry" />
        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
        <updated>2026-03-04T12:00:00Z</updated>
        <summary>Example Entry Summary</summary>
    </entry>
</feed>"#;

        // (url text, position just past the end of the empty element)
        let expected: Vec<RawUri> = vec![
            ("https://example.com", span(4, 40)),
            ("https://example.com/entry", span(12, 50)),
        ]
        .into_iter()
        .map(|(url, location)| RawUri {
            text: url.to_string(),
            element: Some("link".to_string()),
            attribute: Some("href".to_string()),
            span: location,
        })
        .collect();

        assert_eq!(extract(input), expected);
    }
}