1use url::Url;
2
/// A single page record taken from a `<url>` element of a sitemap.
///
/// String fields hold the element text as found in the document; no
/// validation or normalisation of dates, frequencies or URLs is performed.
#[derive(Debug, Clone, PartialEq)]
pub struct SitemapUrl {
    /// Text of the `<loc>` element.
    pub loc: String,
    /// Text of the `<lastmod>` element, if one was present (kept as a string,
    /// not parsed as a date).
    pub lastmod: Option<String>,
    /// Text of the `<changefreq>` element, if one was present.
    pub changefreq: Option<String>,
    /// Value of the `<priority>` element parsed as `f32`; `None` when the
    /// element is absent or its text is not a valid float.
    pub priority: Option<f32>,
}

/// One record produced by [`parse_sitemap`].
///
/// `PartialEq` is derived so entries can be compared directly (for example
/// with `assert_eq!`). `Eq` is intentionally not derived because
/// [`SitemapUrl::priority`] is a float.
#[derive(Debug, Clone, PartialEq)]
pub enum SitemapEntry {
    /// A page described by a `<url>` element.
    Url(SitemapUrl),
    /// The `<loc>` text found inside a `<sitemap>` element, i.e. a reference
    /// to another sitemap document.
    Index(String),
}
16
17pub fn parse_sitemap(content: &str) -> Vec<SitemapEntry> {
18 let mut entries = Vec::new();
19 let mut current_url: Option<SitemapUrl> = None;
20 let mut in_url = false;
21 let mut in_sitemap = false;
22
23 for line in content.lines() {
24 let line = line.trim();
25
26 if line.starts_with("<url>") {
27 in_url = true;
28 current_url = Some(SitemapUrl {
29 loc: String::new(),
30 lastmod: None,
31 changefreq: None,
32 priority: None,
33 });
34 } else if line.starts_with("</url>") {
35 in_url = false;
36 if let Some(url) = current_url.take() {
37 if !url.loc.is_empty() {
38 entries.push(SitemapEntry::Url(url));
39 }
40 }
41 } else if line.starts_with("<sitemap>") {
42 in_sitemap = true;
43 } else if line.starts_with("</sitemap>") {
44 in_sitemap = false;
45 } else if in_url || in_sitemap {
46 if let Some(tag_start) = line.find('<') {
47 if let Some(tag_end) = line.find('>') {
48 let tag = &line[tag_start + 1..tag_end];
49 if !tag.starts_with('/') {
50 let content_start = tag_end + 1;
51 if let Some(close_start) = line[content_start..].find('<') {
52 let tag_content = line[content_start..content_start + close_start]
53 .trim()
54 .to_string();
55
56 if in_url {
57 if let Some(ref mut url) = current_url {
58 match tag {
59 "loc" => url.loc = tag_content,
60 "lastmod" => url.lastmod = Some(tag_content),
61 "changefreq" => url.changefreq = Some(tag_content),
62 "priority" => url.priority = tag_content.parse().ok(),
63 _ => {}
64 }
65 }
66 } else if in_sitemap && tag == "loc" {
67 entries.push(SitemapEntry::Index(tag_content));
68 }
69 }
70 }
71 }
72 }
73 }
74 }
75
76 entries
77}
78
79pub fn discover_sitemap_urls(base_url: &str) -> Vec<String> {
80 let base = match Url::parse(base_url) {
81 Ok(u) => u,
82 Err(_) => return vec![],
83 };
84
85 let origin = format!(
86 "{}://{}{}",
87 base.scheme(),
88 base.host_str().unwrap_or(""),
89 if let Some(port) = base.port() {
90 format!(":{}", port)
91 } else {
92 String::new()
93 }
94 );
95
96 vec![
97 format!("{}/sitemap.xml", origin),
98 format!("{}/sitemap_index.xml", origin),
99 format!("{}/sitemap-index.xml", origin),
100 format!("{}/robots.txt", origin),
101 ]
102}
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Unwraps a URL entry, failing the test for any other variant.
    fn expect_url(entry: &SitemapEntry) -> &SitemapUrl {
        match entry {
            SitemapEntry::Url(url) => url,
            _ => panic!("Expected URL entry"),
        }
    }

    /// Unwraps a sitemap-index entry, failing the test for any other variant.
    fn expect_index(entry: &SitemapEntry) -> &str {
        match entry {
            SitemapEntry::Index(url) => url,
            _ => panic!("Expected Index entry"),
        }
    }

    #[test]
    fn parse_simple_sitemap() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page1</loc>
    <lastmod>2024-01-01</lastmod>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/page2</loc>
  </url>
</urlset>"#;

        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 2);

        // First record carries every optional field.
        let first = expect_url(&entries[0]);
        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2024-01-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        // Second record only has a location.
        let second = expect_url(&entries[1]);
        assert_eq!(second.loc, "https://example.com/page2");
        assert!(second.lastmod.is_none());
    }

    #[test]
    fn parse_sitemap_index() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap1.xml</loc>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap2.xml</loc>
  </sitemap>
</sitemapindex>"#;

        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 2);
        assert_eq!(expect_index(&entries[0]), "https://example.com/sitemap1.xml");
    }

    #[test]
    fn discover_sitemap_urls_generates_common_paths() {
        let candidates = discover_sitemap_urls("https://example.com/page");
        let has = |wanted: &str| candidates.iter().any(|c| c == wanted);

        assert!(has("https://example.com/sitemap.xml"));
        assert!(has("https://example.com/sitemap_index.xml"));
        assert!(has("https://example.com/robots.txt"));
    }

    #[test]
    fn parse_empty_sitemap() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>"#;

        assert!(parse_sitemap(xml).is_empty());
    }
}