lychee_lib/extract/
xml.rs1use log::warn;
3use quick_xml::Reader;
4use quick_xml::events::Event;
5
6use crate::types::uri::raw::{RawUri, SpanProvider};
7
8pub(crate) fn extract_xml<S: SpanProvider>(input: &str, span_provider: &S) -> Vec<RawUri> {
10 let mut reader = Reader::from_str(input);
11
12 let mut uris: Vec<RawUri> = Vec::new();
13
14 loop {
15 match reader.read_event().unwrap() {
16 Event::Start(e) => match e.name().as_ref() {
17 b"loc" | b"link" => {
18 let start_of_text_offset: usize = reader.buffer_position().try_into().unwrap_or_default();
19 let element = String::from_utf8(e.name().as_ref().to_vec()).unwrap_or_default();
20 let text = reader.read_text(e.name()).unwrap_or_default().as_ref().to_string();
21 let span = span_provider.span(start_of_text_offset);
22
23 if !text.is_empty() && !element.is_empty() {
24 uris.push(RawUri {
25 text,
26 element: Some(element),
27 attribute: None,
28 span
29 });
30 }
31 },
32 _ => {}
33 },
34 Event::Empty(e) if e.name().as_ref() == b"link" => {
35 for attr in e.attributes().flatten() {
36 if attr.key.as_ref() == b"href" {
37 let text = std::str::from_utf8(attr.value.as_ref())
38 .unwrap_or("")
39 .to_string();
40 let element = std::str::from_utf8(e.name().as_ref())
41 .unwrap_or("")
42 .to_string();
43 let end_of_empty_tag: usize =
44 reader.buffer_position().try_into().unwrap_or_default();
45 let span = span_provider.span(end_of_empty_tag);
47
48 if !text.is_empty() && !element.is_empty() {
49 uris.push(RawUri {
50 text,
51 element: Some(element),
52 attribute: Some("href".to_string()),
53 span,
54 });
55 }
56 }
57 }
58 }
59 Event::Eof => break,
60 _ => {}
61 }
62 }
63
64 if uris.is_empty() {
65 warn!(
66 "No URLs found in XML input. Currently, lychee only supports extracting URLs from sitemaps, RSS and Atom feeds. If your XML contains links in a different format, please consider submitting a feature request or contributing support for additional XML formats."
67 );
68 }
69
70 uris
71}
72
// Unit tests for `extract_xml`, one per supported feed shape: sitemap `<loc>`
// text, RSS `<link>` text and Atom `<link href="..."/>` attributes.
//
// NOTE(review): the expected `span(a, b)` values look like (line, column)
// positions inside the raw-string fixtures, taken right after the opening tag
// (text links) or right after the empty tag (`href` links). The columns depend
// on the exact indentation inside the raw strings — confirm before
// reformatting any fixture.
#[cfg(test)]
mod tests {
    use crate::types::uri::raw::{SourceSpanProvider, span};

    use super::*;

    /// Runs `extract_xml` on `input` with a `SourceSpanProvider` built from
    /// that same input.
    fn extract(input: &str) -> Vec<RawUri> {
        extract_xml(input, &SourceSpanProvider::from_input(input))
    }

    /// Sitemap: the text of every `<loc>` element is extracted; `<lastmod>`
    /// elements are not.
    #[test]
    fn test_extract_sitemap_links() {
        let input = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
         <loc>https://elastisys.io/welkin/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
    <url>
         <loc>https://elastisys.io/welkin/architecture/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
    <url>
         <loc>https://elastisys.io/welkin/glossary/</loc>
         <lastmod>2026-03-04</lastmod>
    </url>
</urlset>"#;

        // One entry per `<loc>`, in document order; no attribute is recorded
        // for text links.
        let expected = vec![
            RawUri {
                text: "https://elastisys.io/welkin/".to_string(),
                element: Some("loc".to_string()),
                attribute: None,
                span: span(4, 15),
            },
            RawUri {
                text: "https://elastisys.io/welkin/architecture/".to_string(),
                element: Some("loc".to_string()),
                attribute: None,
                span: span(8, 15),
            },
            RawUri {
                text: "https://elastisys.io/welkin/glossary/".to_string(),
                element: Some("loc".to_string()),
                attribute: None,
                span: span(12, 15),
            },
        ];

        let uris = extract(input);

        assert_eq!(uris, expected);
    }

    /// RSS: the text of `<link>` elements is extracted, both at channel level
    /// and inside `<item>`; `<title>` and `<description>` are not.
    #[test]
    fn test_extract_rss_links() {
        let input = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
    <channel>
        <title>Example Feed</title>
        <link>https://example.com</link>
        <description>Example RSS Feed</description>
        <item>
            <title>Example Item</title>
            <link>https://example.com/item</link>
            <description>Example Item Description</description>
        </item>
    </channel>
</rss>"#;

        // Channel link first, then the item link.
        let expected = vec![
            RawUri {
                text: "https://example.com".to_string(),
                element: Some("link".to_string()),
                attribute: None,
                span: span(5, 15),
            },
            RawUri {
                text: "https://example.com/item".to_string(),
                element: Some("link".to_string()),
                attribute: None,
                span: span(9, 19),
            },
        ];

        let uris = extract(input);

        assert_eq!(uris, expected);
    }

    /// Atom: the `href` attribute of empty `<link ... />` elements is
    /// extracted; the `urn:uuid:` values in `<id>` are not.
    #[test]
    fn test_extract_atom_links() {
        let input = r#"<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <title>Example Feed</title>
    <link href="https://example.com" />
    <updated>2026-03-04T12:00:00Z</updated>
    <author>
        <name>John Doe</name>
    </author>
    <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
    <entry>
        <title>Example Entry</title>
        <link href="https://example.com/entry" />
        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
        <updated>2026-03-04T12:00:00Z</updated>
        <summary>Example Entry Summary</summary>
    </entry>
</feed>"#;

        // Attribute links record both the element and the attribute name.
        let expected = vec![
            RawUri {
                text: "https://example.com".to_string(),
                element: Some("link".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 40),
            },
            RawUri {
                text: "https://example.com/entry".to_string(),
                element: Some("link".to_string()),
                attribute: Some("href".to_string()),
                span: span(12, 50),
            },
        ];

        let uris = extract(input);

        assert_eq!(uris, expected);
    }
}