Skip to main content

stygian_graph/adapters/
sitemap.rs

//! Sitemap / sitemap-index [`ScrapingService`](crate::ports::ScrapingService) adapter
//!
//! Parses XML sitemaps (`<urlset>`) and sitemap index files (`<sitemapindex>`),
//! emitting discovered URLs with metadata for downstream pipeline nodes.
//!
//! Supports:
//! - Standard sitemaps (`<urlset>` with `<url>` entries)
//! - Sitemap index files (`<sitemapindex>` with nested `<sitemap>` refs)
//! - Gzipped sitemaps (`.xml.gz`) via `flate2`
//! - Filtering by `lastmod` date range or `priority` threshold
//!
//! # Example
//!
//! ```no_run
//! use stygian_graph::adapters::sitemap::SitemapAdapter;
//! use stygian_graph::ports::{ScrapingService, ServiceInput};
//! use serde_json::json;
//!
//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
//! let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
//! let input = ServiceInput {
//!     url: "https://example.com/sitemap.xml".into(),
//!     params: json!({}),
//! };
//! let output = adapter.execute(input).await.unwrap();
//! println!("{}", output.data); // JSON array of discovered URLs
//! # });
//! ```
29
30use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
32use async_trait::async_trait;
33use flate2::read::GzDecoder;
34use quick_xml::Reader;
35use quick_xml::events::Event;
36use serde::{Deserialize, Serialize};
37use serde_json::json;
38use std::io::Read;
39
40// ─── Domain types ─────────────────────────────────────────────────────────────
41
42/// A single URL entry extracted from a sitemap.
43///
44/// # Example
45///
46/// ```
47/// use stygian_graph::adapters::sitemap::SitemapEntry;
48///
49/// let entry = SitemapEntry {
50///     loc: "https://example.com/page".into(),
51///     lastmod: Some("2026-03-01".into()),
52///     changefreq: Some("weekly".into()),
53///     priority: Some(0.8),
54/// };
55/// ```
56#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
57pub struct SitemapEntry {
58    /// Absolute URL.
59    pub loc: String,
60    /// Last-modified date string (ISO 8601).
61    pub lastmod: Option<String>,
62    /// Change frequency hint.
63    pub changefreq: Option<String>,
64    /// Priority (0.0–1.0).
65    pub priority: Option<f64>,
66}
67
68// ─── Adapter ──────────────────────────────────────────────────────────────────
69
70/// Sitemap / sitemap-index source adapter.
71///
72/// Fetches and parses XML sitemaps, recursively resolving sitemap index files
73/// up to a configurable depth limit.
74///
75/// # Example
76///
77/// ```no_run
78/// use stygian_graph::adapters::sitemap::SitemapAdapter;
79///
80/// let adapter = SitemapAdapter::new(reqwest::Client::new(), 3);
81/// ```
82pub struct SitemapAdapter {
83    client: reqwest::Client,
84    max_depth: usize,
85}
86
87impl SitemapAdapter {
88    /// Create a new sitemap adapter.
89    ///
90    /// `max_depth` controls how many levels of sitemap-index nesting to follow.
91    ///
92    /// # Example
93    ///
94    /// ```
95    /// use stygian_graph::adapters::sitemap::SitemapAdapter;
96    ///
97    /// let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
98    /// ```
99    pub fn new(client: reqwest::Client, max_depth: usize) -> Self {
100        Self { client, max_depth }
101    }
102
103    /// Fetch raw bytes from a URL, transparently decompressing `.xml.gz`.
104    ///
105    /// # Errors
106    ///
107    /// Returns [`StygianError::Service`] on HTTP or decompression failure.
108    async fn fetch_bytes(&self, url: &str) -> Result<String> {
109        let resp = self.client.get(url).send().await.map_err(|e| {
110            StygianError::Service(ServiceError::Unavailable(format!(
111                "sitemap fetch failed: {e}"
112            )))
113        })?;
114
115        if !resp.status().is_success() {
116            return Err(StygianError::Service(ServiceError::InvalidResponse(
117                format!("sitemap returned HTTP {}", resp.status()),
118            )));
119        }
120
121        let bytes = resp.bytes().await.map_err(|e| {
122            StygianError::Service(ServiceError::Unavailable(format!(
123                "sitemap body read failed: {e}"
124            )))
125        })?;
126
127        // Attempt gzip decompression if URL ends in .gz or content looks gzipped
128        if url.ends_with(".gz") || (bytes.len() >= 2 && bytes[0] == 0x1f && bytes[1] == 0x8b) {
129            let mut decoder = GzDecoder::new(&bytes[..]);
130            let mut xml = String::new();
131            decoder.read_to_string(&mut xml).map_err(|e| {
132                StygianError::Service(ServiceError::InvalidResponse(format!(
133                    "gzip decompression failed: {e}"
134                )))
135            })?;
136            Ok(xml)
137        } else {
138            String::from_utf8(bytes.to_vec()).map_err(|e| {
139                StygianError::Service(ServiceError::InvalidResponse(format!(
140                    "sitemap not valid UTF-8: {e}"
141                )))
142            })
143        }
144    }
145
146    /// Recursively resolve a sitemap URL, returning all discovered entries.
147    ///
148    /// # Errors
149    ///
150    /// Returns [`StygianError::Service`] on fetch, parse, or depth-limit errors.
151    async fn resolve(&self, url: &str, depth: usize) -> Result<Vec<SitemapEntry>> {
152        if depth > self.max_depth {
153            return Err(StygianError::Service(ServiceError::InvalidResponse(
154                format!(
155                    "sitemap index nesting exceeded max depth ({depth} > {})",
156                    self.max_depth
157                ),
158            )));
159        }
160
161        let xml = self.fetch_bytes(url).await?;
162        let root_kind = detect_root_element(&xml)?;
163
164        match root_kind {
165            RootElement::UrlSet => parse_urlset(&xml),
166            RootElement::SitemapIndex => {
167                let nested_urls = parse_sitemapindex(&xml)?;
168                let mut all = Vec::new();
169                for nested_url in &nested_urls {
170                    let entries = Box::pin(self.resolve(nested_url, depth + 1)).await?;
171                    all.extend(entries);
172                }
173                Ok(all)
174            }
175        }
176    }
177}
178
179#[async_trait]
180impl ScrapingService for SitemapAdapter {
181    /// Fetch and parse a sitemap, returning discovered URLs as JSON.
182    ///
183    /// # Params (optional)
184    ///
185    /// * `min_priority` — f64, filter entries with priority >= this value.
186    /// * `lastmod_after` — string, include only entries with lastmod >= this date.
187    /// * `lastmod_before` — string, include only entries with lastmod <= this date.
188    ///
189    /// # Example
190    ///
191    /// ```no_run
192    /// # use stygian_graph::adapters::sitemap::SitemapAdapter;
193    /// # use stygian_graph::ports::{ScrapingService, ServiceInput};
194    /// # use serde_json::json;
195    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
196    /// let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
197    /// let input = ServiceInput {
198    ///     url: "https://example.com/sitemap.xml".into(),
199    ///     params: json!({ "min_priority": 0.5 }),
200    /// };
201    /// let out = adapter.execute(input).await.unwrap();
202    /// # });
203    /// ```
204    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
205        let mut entries = self.resolve(&input.url, 0).await?;
206
207        // Apply optional filters
208        if let Some(min_pri) = input.params.get("min_priority").and_then(|v| v.as_f64()) {
209            entries.retain(|e| e.priority.unwrap_or(0.0) >= min_pri);
210        }
211        if let Some(after) = input.params.get("lastmod_after").and_then(|v| v.as_str()) {
212            entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= after));
213        }
214        if let Some(before) = input.params.get("lastmod_before").and_then(|v| v.as_str()) {
215            entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm <= before));
216        }
217
218        let count = entries.len();
219        let data = serde_json::to_string(&entries).map_err(|e| {
220            StygianError::Service(ServiceError::InvalidResponse(format!(
221                "sitemap serialization failed: {e}"
222            )))
223        })?;
224
225        Ok(ServiceOutput {
226            data,
227            metadata: json!({
228                "source": "sitemap",
229                "url_count": count,
230                "source_url": input.url,
231            }),
232        })
233    }
234
235    fn name(&self) -> &'static str {
236        "sitemap"
237    }
238}
239
240// ─── XML parsing helpers ──────────────────────────────────────────────────────
241
/// Kind of sitemap document, identified by its root XML element.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RootElement {
    /// `<urlset>` — a plain sitemap listing page URLs.
    UrlSet,
    /// `<sitemapindex>` — an index listing further sitemap documents.
    SitemapIndex,
}
247
248/// Detect whether the XML document is a `<urlset>` or `<sitemapindex>`.
249fn detect_root_element(xml: &str) -> Result<RootElement> {
250    let mut reader = Reader::from_str(xml);
251    let mut buf = Vec::new();
252
253    loop {
254        match reader.read_event_into(&mut buf) {
255            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
256                let local = e.local_name();
257                let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
258                return match name {
259                    "urlset" => Ok(RootElement::UrlSet),
260                    "sitemapindex" => Ok(RootElement::SitemapIndex),
261                    _ => Err(StygianError::Service(ServiceError::InvalidResponse(
262                        format!("unexpected XML root element: <{name}>"),
263                    ))),
264                };
265            }
266            Ok(Event::Eof) => {
267                return Err(StygianError::Service(ServiceError::InvalidResponse(
268                    "empty or invalid XML document".into(),
269                )));
270            }
271            Err(e) => {
272                return Err(StygianError::Service(ServiceError::InvalidResponse(
273                    format!("XML parse error: {e}"),
274                )));
275            }
276            _ => {} // skip processing instructions, comments, decl
277        }
278        buf.clear();
279    }
280}
281
282/// Parse a `<urlset>` document into a list of [`SitemapEntry`].
283fn parse_urlset(xml: &str) -> Result<Vec<SitemapEntry>> {
284    let mut reader = Reader::from_str(xml);
285    let mut buf = Vec::new();
286    let mut entries = Vec::new();
287
288    // Current entry being built
289    let mut current: Option<SitemapEntryBuilder> = None;
290    let mut current_tag: Option<String> = None;
291
292    loop {
293        match reader.read_event_into(&mut buf) {
294            Ok(Event::Start(ref e)) => {
295                let name = local_name(e);
296                match name.as_str() {
297                    "url" => {
298                        current = Some(SitemapEntryBuilder::default());
299                    }
300                    "loc" | "lastmod" | "changefreq" | "priority" => {
301                        current_tag = Some(name);
302                    }
303                    _ => {}
304                }
305            }
306            Ok(Event::Text(ref t)) => {
307                if let (Some(builder), Some(tag)) = (&mut current, &current_tag) {
308                    let text = t.unescape().unwrap_or_default().trim().to_string();
309                    if !text.is_empty() {
310                        match tag.as_str() {
311                            "loc" => builder.loc = Some(text),
312                            "lastmod" => builder.lastmod = Some(text),
313                            "changefreq" => builder.changefreq = Some(text),
314                            "priority" => builder.priority = text.parse().ok(),
315                            _ => {}
316                        }
317                    }
318                }
319            }
320            Ok(Event::End(ref e)) => {
321                let name = local_name_end(e);
322                if name == "url"
323                    && let Some(builder) = current.take()
324                    && let Some(entry) = builder.build()
325                {
326                    entries.push(entry);
327                }
328                if current_tag.as_deref() == Some(&name) {
329                    current_tag = None;
330                }
331            }
332            Ok(Event::Eof) => break,
333            Err(e) => {
334                return Err(StygianError::Service(ServiceError::InvalidResponse(
335                    format!("sitemap XML parse error: {e}"),
336                )));
337            }
338            _ => {}
339        }
340        buf.clear();
341    }
342
343    Ok(entries)
344}
345
346/// Parse a `<sitemapindex>` document, returning the `<loc>` URLs of nested sitemaps.
347fn parse_sitemapindex(xml: &str) -> Result<Vec<String>> {
348    let mut reader = Reader::from_str(xml);
349    let mut buf = Vec::new();
350    let mut urls = Vec::new();
351    let mut in_sitemap = false;
352    let mut in_loc = false;
353
354    loop {
355        match reader.read_event_into(&mut buf) {
356            Ok(Event::Start(ref e)) => {
357                let name = local_name(e);
358                match name.as_str() {
359                    "sitemap" => in_sitemap = true,
360                    "loc" if in_sitemap => in_loc = true,
361                    _ => {}
362                }
363            }
364            Ok(Event::Text(ref t)) => {
365                if in_loc {
366                    let text = t.unescape().unwrap_or_default().trim().to_string();
367                    if !text.is_empty() {
368                        urls.push(text);
369                    }
370                }
371            }
372            Ok(Event::End(ref e)) => {
373                let name = local_name_end(e);
374                match name.as_str() {
375                    "sitemap" => {
376                        in_sitemap = false;
377                        in_loc = false;
378                    }
379                    "loc" => in_loc = false,
380                    _ => {}
381                }
382            }
383            Ok(Event::Eof) => break,
384            Err(e) => {
385                return Err(StygianError::Service(ServiceError::InvalidResponse(
386                    format!("sitemapindex XML parse error: {e}"),
387                )));
388            }
389            _ => {}
390        }
391        buf.clear();
392    }
393
394    Ok(urls)
395}
396
397/// Extract the local name (without namespace prefix) from a start element.
398fn local_name(e: &quick_xml::events::BytesStart<'_>) -> String {
399    std::str::from_utf8(e.local_name().as_ref())
400        .unwrap_or("")
401        .to_string()
402}
403
404/// Extract the local name from an end element.
405fn local_name_end(e: &quick_xml::events::BytesEnd<'_>) -> String {
406    std::str::from_utf8(e.local_name().as_ref())
407        .unwrap_or("")
408        .to_string()
409}
410
411// ─── Builder ──────────────────────────────────────────────────────────────────
412
413#[derive(Default)]
414struct SitemapEntryBuilder {
415    loc: Option<String>,
416    lastmod: Option<String>,
417    changefreq: Option<String>,
418    priority: Option<f64>,
419}
420
421impl SitemapEntryBuilder {
422    fn build(self) -> Option<SitemapEntry> {
423        Some(SitemapEntry {
424            loc: self.loc?,
425            lastmod: self.lastmod,
426            changefreq: self.changefreq,
427            priority: self.priority,
428        })
429    }
430}
431
432// ─── Tests ────────────────────────────────────────────────────────────────────
433
#[cfg(test)]
mod tests {
    use super::*;

    /// `<urlset>` fixture with three entries: fully populated, partially
    /// populated, and `loc`-only.
    const URLSET_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page1</loc>
    <lastmod>2026-03-01</lastmod>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/page2</loc>
    <lastmod>2026-02-15</lastmod>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/page3</loc>
  </url>
</urlset>"#;

    /// `<sitemapindex>` fixture referencing one plain and one gzipped sitemap.
    const SITEMAPINDEX_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap1.xml</loc>
    <lastmod>2026-03-01</lastmod>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap2.xml.gz</loc>
  </sitemap>
</sitemapindex>"#;

    #[test]
    fn parse_urlset_with_3_urls() {
        let parsed = parse_urlset(URLSET_XML).expect("parse");
        assert_eq!(parsed.len(), 3);

        let first = &parsed[0];
        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2026-03-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        let second = &parsed[1];
        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.priority, Some(0.5));
        assert!(second.changefreq.is_none());

        let third = &parsed[2];
        assert_eq!(third.loc, "https://example.com/page3");
        assert!(third.lastmod.is_none());
        assert!(third.priority.is_none());
    }

    #[test]
    fn parse_sitemapindex_extracts_nested_urls() {
        let nested = parse_sitemapindex(SITEMAPINDEX_XML).expect("parse");
        assert_eq!(nested.len(), 2);
        assert_eq!(nested[0], "https://example.com/sitemap1.xml");
        assert_eq!(nested[1], "https://example.com/sitemap2.xml.gz");
    }

    #[test]
    fn detect_root_urlset() {
        let kind = detect_root_element(URLSET_XML).expect("detect");
        assert_eq!(kind, RootElement::UrlSet);
    }

    #[test]
    fn detect_root_sitemapindex() {
        let kind = detect_root_element(SITEMAPINDEX_XML).expect("detect");
        assert_eq!(kind, RootElement::SitemapIndex);
    }

    #[test]
    fn filter_by_lastmod_range() {
        let mut parsed = parse_urlset(URLSET_XML).expect("parse");
        // Keep only entries dated on or after 2026-03-01.
        parsed.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= "2026-03-01"));
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].loc, "https://example.com/page1");
    }

    #[test]
    fn filter_by_priority_threshold() {
        let mut parsed = parse_urlset(URLSET_XML).expect("parse");
        parsed.retain(|e| e.priority.unwrap_or(0.0) >= 0.6);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].loc, "https://example.com/page1");
    }

    #[test]
    fn gzip_decompression() {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        // Compress the fixture in memory.
        let mut gz = GzEncoder::new(Vec::new(), Compression::default());
        gz.write_all(URLSET_XML.as_bytes()).expect("gz write");
        let packed = gz.finish().expect("gz finish");

        // Round-trip through the decoder, then parse the result.
        let mut unpacked = String::new();
        GzDecoder::new(&packed[..])
            .read_to_string(&mut unpacked)
            .expect("gz read");

        let parsed = parse_urlset(&unpacked).expect("parse");
        assert_eq!(parsed.len(), 3);
    }

    #[test]
    fn malformed_xml_returns_error() {
        let outcome = detect_root_element("<not-a-sitemap><broken");
        assert!(outcome.is_err());
    }

    #[test]
    fn empty_xml_returns_error() {
        let outcome = detect_root_element("");
        assert!(outcome.is_err());
    }

    #[test]
    fn unexpected_root_element_returns_error() {
        let html = r#"<?xml version="1.0"?><html><body>nope</body></html>"#;
        let outcome = detect_root_element(html);
        assert!(outcome.is_err());
    }

    #[test]
    fn urlset_with_no_urls_returns_empty() {
        let doc = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        let parsed = parse_urlset(doc).expect("parse");
        assert!(parsed.is_empty());
    }

    #[test]
    fn url_without_loc_is_skipped() {
        let doc = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2026-01-01</lastmod>
  </url>
  <url>
    <loc>https://example.com/valid</loc>
  </url>
</urlset>"#;
        let parsed = parse_urlset(doc).expect("parse");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].loc, "https://example.com/valid");
    }
}