Skip to main content

argus_parser/
sitemap.rs

1use url::Url;
2
/// One `<url>` record of a sitemap.
///
/// Values are kept as the text found in the document; they are not validated
/// or normalised (dates stay strings, `priority` is not range-checked).
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SitemapUrl {
    /// Text of the `<loc>` element: the address of the page.
    pub loc: String,
    /// Text of the `<lastmod>` element, if one was present.
    pub lastmod: Option<String>,
    /// Text of the `<changefreq>` element, if one was present.
    pub changefreq: Option<String>,
    /// Value of the `<priority>` element, if present and parseable as `f32`.
    pub priority: Option<f32>,
}

/// A single item produced by [`parse_sitemap`].
///
/// `Eq` is intentionally not derived: `SitemapUrl::priority` is a float.
#[derive(Debug, Clone, PartialEq)]
pub enum SitemapEntry {
    /// A page record taken from a `<url>` element.
    Url(SitemapUrl),
    /// The `<loc>` of a `<sitemap>` element, i.e. the address of another
    /// sitemap document listed by a sitemap index.
    Index(String),
}
16
17pub fn parse_sitemap(content: &str) -> Vec<SitemapEntry> {
18    let mut entries = Vec::new();
19    let mut current_url: Option<SitemapUrl> = None;
20    let mut in_url = false;
21    let mut in_sitemap = false;
22
23    for line in content.lines() {
24        let line = line.trim();
25
26        if line.starts_with("<url>") {
27            in_url = true;
28            current_url = Some(SitemapUrl {
29                loc: String::new(),
30                lastmod: None,
31                changefreq: None,
32                priority: None,
33            });
34        } else if line.starts_with("</url>") {
35            in_url = false;
36            if let Some(url) = current_url.take() {
37                if !url.loc.is_empty() {
38                    entries.push(SitemapEntry::Url(url));
39                }
40            }
41        } else if line.starts_with("<sitemap>") {
42            in_sitemap = true;
43        } else if line.starts_with("</sitemap>") {
44            in_sitemap = false;
45        } else if in_url || in_sitemap {
46            if let Some(tag_start) = line.find('<') {
47                if let Some(tag_end) = line.find('>') {
48                    let tag = &line[tag_start + 1..tag_end];
49                    if !tag.starts_with('/') {
50                        let content_start = tag_end + 1;
51                        if let Some(close_start) = line[content_start..].find('<') {
52                            let tag_content = line[content_start..content_start + close_start]
53                                .trim()
54                                .to_string();
55
56                            if in_url {
57                                if let Some(ref mut url) = current_url {
58                                    match tag {
59                                        "loc" => url.loc = tag_content,
60                                        "lastmod" => url.lastmod = Some(tag_content),
61                                        "changefreq" => url.changefreq = Some(tag_content),
62                                        "priority" => url.priority = tag_content.parse().ok(),
63                                        _ => {}
64                                    }
65                                }
66                            } else if in_sitemap && tag == "loc" {
67                                entries.push(SitemapEntry::Index(tag_content));
68                            }
69                        }
70                    }
71                }
72            }
73        }
74    }
75
76    entries
77}
78
79pub fn discover_sitemap_urls(base_url: &str) -> Vec<String> {
80    let base = match Url::parse(base_url) {
81        Ok(u) => u,
82        Err(_) => return vec![],
83    };
84
85    let origin = format!(
86        "{}://{}{}",
87        base.scheme(),
88        base.host_str().unwrap_or(""),
89        if let Some(port) = base.port() {
90            format!(":{}", port)
91        } else {
92            String::new()
93        }
94    );
95
96    vec![
97        format!("{}/sitemap.xml", origin),
98        format!("{}/sitemap_index.xml", origin),
99        format!("{}/sitemap-index.xml", origin),
100        format!("{}/robots.txt", origin),
101    ]
102}
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Unwraps a page entry, failing the test if it is an index entry.
    fn expect_url(entry: &SitemapEntry) -> &SitemapUrl {
        match entry {
            SitemapEntry::Url(url) => url,
            _ => panic!("Expected URL entry"),
        }
    }

    /// Unwraps an index entry, failing the test if it is a page entry.
    fn expect_index(entry: &SitemapEntry) -> &str {
        match entry {
            SitemapEntry::Index(loc) => loc,
            _ => panic!("Expected Index entry"),
        }
    }

    #[test]
    fn parse_simple_sitemap() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page1</loc>
    <lastmod>2024-01-01</lastmod>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/page2</loc>
  </url>
</urlset>"#;

        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 2);

        let first = expect_url(&entries[0]);
        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod, Some("2024-01-01".to_string()));
        assert_eq!(first.changefreq, Some("daily".to_string()));
        assert_eq!(first.priority, Some(0.8));

        // Optional fields that are missing from the document stay `None`.
        let second = expect_url(&entries[1]);
        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.lastmod, None);
        assert_eq!(second.changefreq, None);
        assert_eq!(second.priority, None);
    }

    #[test]
    fn parse_sitemap_skips_url_without_loc() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2024-01-01</lastmod>
  </url>
  <url>
    <loc>https://example.com/only</loc>
  </url>
</urlset>"#;

        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(expect_url(&entries[0]).loc, "https://example.com/only");
    }

    #[test]
    fn parse_sitemap_index() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap1.xml</loc>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap2.xml</loc>
  </sitemap>
</sitemapindex>"#;

        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 2);
        assert_eq!(expect_index(&entries[0]), "https://example.com/sitemap1.xml");
        assert_eq!(expect_index(&entries[1]), "https://example.com/sitemap2.xml");
    }

    #[test]
    fn discover_sitemap_urls_generates_common_paths() {
        let urls = discover_sitemap_urls("https://example.com/page");
        assert_eq!(
            urls,
            vec![
                "https://example.com/sitemap.xml".to_string(),
                "https://example.com/sitemap_index.xml".to_string(),
                "https://example.com/sitemap-index.xml".to_string(),
                "https://example.com/robots.txt".to_string(),
            ]
        );
    }

    #[test]
    fn discover_sitemap_urls_keeps_explicit_port() {
        let urls = discover_sitemap_urls("http://example.com:8080/a/b?c=d#frag");
        assert_eq!(urls.len(), 4);
        assert_eq!(urls[0], "http://example.com:8080/sitemap.xml");
        assert_eq!(urls[3], "http://example.com:8080/robots.txt");
    }

    #[test]
    fn discover_sitemap_urls_rejects_unparseable_input() {
        assert!(discover_sitemap_urls("not a url").is_empty());
        assert!(discover_sitemap_urls("").is_empty());
    }

    #[test]
    fn parse_empty_sitemap() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>"#;

        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 0);
    }
}