Skip to main content

lychee_lib/extract/
xml.rs

1//! Extract links from XML documents. Currently supports sitemaps, RSS and Atom feeds.
2use log::warn;
3use quick_xml::Reader;
4use quick_xml::events::Event;
5
6use crate::types::uri::raw::{RawUri, SpanProvider};
7
8/// Extract unparsed URL strings from common XML formats, like sitemap.xml, RSS feeds, or Atom feeds.
9pub(crate) fn extract_xml<S: SpanProvider>(input: &str, span_provider: &S) -> Vec<RawUri> {
10    let mut reader = Reader::from_str(input);
11
12    let mut uris: Vec<RawUri> = Vec::new();
13
14    loop {
15        match reader.read_event().unwrap() {
16            Event::Start(e) => match e.name().as_ref() {
17                b"loc" /* sitemap */ | b"link" /* RSS */ => {
18                    let start_of_text_offset: usize = reader.buffer_position().try_into().unwrap_or_default();
19                    let element = String::from_utf8(e.name().as_ref().to_vec()).unwrap_or_default();
20                    let text = reader.read_text(e.name()).unwrap_or_default().as_ref().to_string();
21                    let span = span_provider.span(start_of_text_offset);
22
23                    if !text.is_empty() && !element.is_empty() {
24                        uris.push(RawUri {
25                            text,
26                            element: Some(element),
27                            attribute: None,
28                            span
29                        });
30                    }
31                },
32                _ => {}
33            },
34            Event::Empty(e) if e.name().as_ref() == b"link" => {
35                for attr in e.attributes().flatten() {
36                    if attr.key.as_ref() == b"href" {
37                        let text = std::str::from_utf8(attr.value.as_ref())
38                            .unwrap_or("")
39                            .to_string();
40                        let element = std::str::from_utf8(e.name().as_ref())
41                            .unwrap_or("")
42                            .to_string();
43                        let end_of_empty_tag: usize =
44                            reader.buffer_position().try_into().unwrap_or_default();
45                        // Span is a bit imprecise, as it points to the end of the element. However, quick_xml does not provide the position of attributes, so this is the best we can do.
46                        let span = span_provider.span(end_of_empty_tag);
47
48                        if !text.is_empty() && !element.is_empty() {
49                            uris.push(RawUri {
50                                text,
51                                element: Some(element),
52                                attribute: Some("href".to_string()),
53                                span,
54                            });
55                        }
56                    }
57                }
58            }
59            Event::Eof => break,
60            _ => {}
61        }
62    }
63
64    if uris.is_empty() {
65        warn!(
66            "No URLs found in XML input. Currently, lychee only supports extracting URLs from sitemaps, RSS and Atom feeds. If your XML contains links in a different format, please consider submitting a feature request or contributing support for additional XML formats."
67        );
68    }
69
70    uris
71}
72
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::uri::raw::{SourceSpanProvider, span};

    /// Run the XML extractor over `input`, using a span provider built from that same input.
    fn extract(input: &str) -> Vec<RawUri> {
        let provider = SourceSpanProvider::from_input(input);
        extract_xml(input, &provider)
    }

    /// `<loc>` text content is extracted from a sitemap.
    #[test]
    fn test_extract_sitemap_links() {
        // Sitemap example generated by mkdocs
        let input = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
         <loc>https://elastisys.io/welkin/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
    <url>
         <loc>https://elastisys.io/welkin/architecture/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
    <url>
         <loc>https://elastisys.io/welkin/glossary/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
</urlset>"#;

        // Each pair is the expected URL text and the span where that text starts.
        let expected: Vec<RawUri> = vec![
            ("https://elastisys.io/welkin/", span(4, 15)),
            ("https://elastisys.io/welkin/architecture/", span(8, 15)),
            ("https://elastisys.io/welkin/glossary/", span(12, 15)),
        ]
        .into_iter()
        .map(|(url, location)| RawUri {
            text: url.to_owned(),
            element: Some("loc".to_owned()),
            attribute: None,
            span: location,
        })
        .collect();

        assert_eq!(extract(input), expected);
    }

    /// `<link>` text content is extracted from an RSS feed.
    #[test]
    fn test_extract_rss_links() {
        // RSS example
        let input = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
    <channel>
        <title>Example Feed</title>
        <link>https://example.com</link>
        <description>Example RSS Feed</description>
        <item>
            <title>Example Item</title>
            <link>https://example.com/item</link>
            <description>Example Item Description</description>
        </item>
    </channel>
</rss>"#;

        // Each pair is the expected URL text and the span where that text starts.
        let expected: Vec<RawUri> = vec![
            ("https://example.com", span(5, 15)),
            ("https://example.com/item", span(9, 19)),
        ]
        .into_iter()
        .map(|(url, location)| RawUri {
            text: url.to_owned(),
            element: Some("link".to_owned()),
            attribute: None,
            span: location,
        })
        .collect();

        assert_eq!(extract(input), expected);
    }

    /// `href` attributes of self-closing `<link />` elements are extracted from an Atom feed.
    #[test]
    fn test_extract_atom_links() {
        // Atom example
        let input = r#"<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <title>Example Feed</title>
    <link href="https://example.com" />
    <updated>2026-03-04T12:00:00Z</updated>
    <author>
        <name>John Doe</name>
    </author>
    <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
    <entry>
        <title>Example Entry</title>
        <link href="https://example.com/entry" />
        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
        <updated>2026-03-04T12:00:00Z</updated>
        <summary>Example Entry Summary</summary>
    </entry>
</feed>"#;

        // Each pair is the expected URL text and the span at the end of its `<link />` tag.
        let expected: Vec<RawUri> = vec![
            ("https://example.com", span(4, 40)),
            ("https://example.com/entry", span(12, 50)),
        ]
        .into_iter()
        .map(|(url, location)| RawUri {
            text: url.to_owned(),
            element: Some("link".to_owned()),
            attribute: Some("href".to_owned()),
            span: location,
        })
        .collect();

        assert_eq!(extract(input), expected);
    }
}