Skip to main content

stygian_graph/adapters/
sitemap.rs

1//! Sitemap / sitemap-index [`ScrapingService`](crate::ports::ScrapingService) adapter
2//!
3//! Parses XML sitemaps (`<urlset>`) and sitemap index files (`<sitemapindex>`),
4//! emitting discovered URLs with metadata for downstream pipeline nodes.
5//!
6//! Supports:
7//! - Standard sitemaps (`<urlset>` with `<url>` entries)
8//! - Sitemap index files (`<sitemapindex>` with nested `<sitemap>` refs)
9//! - Gzipped sitemaps (`.xml.gz`) via `flate2`
10//! - Filtering by `lastmod` date range or `priority` threshold
11//!
12//! # Example
13//!
14//! ```no_run
15//! use stygian_graph::adapters::sitemap::SitemapAdapter;
16//! use stygian_graph::ports::{ScrapingService, ServiceInput};
17//! use serde_json::json;
18//!
19//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
20//! let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
21//! let input = ServiceInput {
22//!     url: "https://example.com/sitemap.xml".into(),
23//!     params: json!({}),
24//! };
25//! let output = adapter.execute(input).await.unwrap();
26//! println!("{}", output.data); // JSON array of discovered URLs
27//! # });
28//! ```
29
30use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
32use async_trait::async_trait;
33use flate2::read::GzDecoder;
34use quick_xml::Reader;
35use quick_xml::events::Event;
36use serde::{Deserialize, Serialize};
37use serde_json::json;
38use std::io::Read;
39
40// ─── Domain types ─────────────────────────────────────────────────────────────
41
/// A single URL entry extracted from a sitemap.
///
/// Every optional field mirrors the optional child element of the same name
/// inside a `<url>` element and is `None` when that element was absent or
/// contained only whitespace.
///
/// # Example
///
/// ```
/// use stygian_graph::adapters::sitemap::SitemapEntry;
///
/// let entry = SitemapEntry {
///     loc: "https://example.com/page".into(),
///     lastmod: Some("2026-03-01".into()),
///     changefreq: Some("weekly".into()),
///     priority: Some(0.8),
/// };
/// ```
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SitemapEntry {
    /// Absolute URL, taken from the `<loc>` element with surrounding
    /// whitespace trimmed.
    pub loc: String,
    /// Last-modified date string (ISO 8601).
    ///
    /// Stored exactly as it appears in the document (trimmed); this module
    /// does not parse or validate it.
    pub lastmod: Option<String>,
    /// Change frequency hint, stored as the raw trimmed text of `<changefreq>`.
    pub changefreq: Option<String>,
    /// Priority (0.0–1.0).
    ///
    /// The parser accepts any value that parses as `f64` and does not
    /// range-check it; text that does not parse yields `None`.
    pub priority: Option<f64>,
}
67
68// ─── Adapter ──────────────────────────────────────────────────────────────────
69
70/// Sitemap / sitemap-index source adapter.
71///
72/// Fetches and parses XML sitemaps, recursively resolving sitemap index files
73/// up to a configurable depth limit.
74///
75/// # Example
76///
77/// ```no_run
78/// use stygian_graph::adapters::sitemap::SitemapAdapter;
79///
80/// let adapter = SitemapAdapter::new(reqwest::Client::new(), 3);
81/// ```
82pub struct SitemapAdapter {
83    client: reqwest::Client,
84    max_depth: usize,
85}
86
87impl SitemapAdapter {
88    /// Create a new sitemap adapter.
89    ///
90    /// `max_depth` controls how many levels of sitemap-index nesting to follow.
91    ///
92    /// # Example
93    ///
94    /// ```
95    /// use stygian_graph::adapters::sitemap::SitemapAdapter;
96    ///
97    /// let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
98    /// ```
99    pub const fn new(client: reqwest::Client, max_depth: usize) -> Self {
100        Self { client, max_depth }
101    }
102
103    /// Fetch raw bytes from a URL, transparently decompressing `.xml.gz`.
104    ///
105    /// # Errors
106    ///
107    /// Returns [`StygianError::Service`] on HTTP or decompression failure.
108    async fn fetch_bytes(&self, url: &str) -> Result<String> {
109        let resp = self.client.get(url).send().await.map_err(|e| {
110            StygianError::Service(ServiceError::Unavailable(format!(
111                "sitemap fetch failed: {e}"
112            )))
113        })?;
114
115        if !resp.status().is_success() {
116            return Err(StygianError::Service(ServiceError::InvalidResponse(
117                format!("sitemap returned HTTP {}", resp.status()),
118            )));
119        }
120
121        let bytes = resp.bytes().await.map_err(|e| {
122            StygianError::Service(ServiceError::Unavailable(format!(
123                "sitemap body read failed: {e}"
124            )))
125        })?;
126
127        // Attempt gzip decompression if URL ends in .gz or content looks gzipped
128        if url.to_ascii_lowercase().ends_with(".gz") || bytes.starts_with(&[0x1f, 0x8b]) {
129            let mut decoder = GzDecoder::new(&bytes[..]);
130            let mut xml = String::new();
131            decoder.read_to_string(&mut xml).map_err(|e| {
132                StygianError::Service(ServiceError::InvalidResponse(format!(
133                    "gzip decompression failed: {e}"
134                )))
135            })?;
136            Ok(xml)
137        } else {
138            String::from_utf8(bytes.to_vec()).map_err(|e| {
139                StygianError::Service(ServiceError::InvalidResponse(format!(
140                    "sitemap not valid UTF-8: {e}"
141                )))
142            })
143        }
144    }
145
146    /// Recursively resolve a sitemap URL, returning all discovered entries.
147    ///
148    /// # Errors
149    ///
150    /// Returns [`StygianError::Service`] on fetch, parse, or depth-limit errors.
151    async fn resolve(&self, url: &str, depth: usize) -> Result<Vec<SitemapEntry>> {
152        if depth > self.max_depth {
153            return Err(StygianError::Service(ServiceError::InvalidResponse(
154                format!(
155                    "sitemap index nesting exceeded max depth ({depth} > {})",
156                    self.max_depth
157                ),
158            )));
159        }
160
161        let xml = self.fetch_bytes(url).await?;
162        let root_kind = detect_root_element(&xml)?;
163
164        match root_kind {
165            RootElement::UrlSet => parse_urlset(&xml),
166            RootElement::SitemapIndex => {
167                let nested_urls = parse_sitemapindex(&xml)?;
168                let mut all = Vec::new();
169                for nested_url in &nested_urls {
170                    let entries = Box::pin(self.resolve(nested_url, depth + 1)).await?;
171                    all.extend(entries);
172                }
173                Ok(all)
174            }
175        }
176    }
177}
178
179#[async_trait]
180impl ScrapingService for SitemapAdapter {
181    /// Fetch and parse a sitemap, returning discovered URLs as JSON.
182    ///
183    /// # Params (optional)
184    ///
185    /// * `min_priority` — f64, filter entries with priority >= this value.
186    /// * `lastmod_after` — string, include only entries with lastmod >= this date.
187    /// * `lastmod_before` — string, include only entries with lastmod <= this date.
188    ///
189    /// # Example
190    ///
191    /// ```no_run
192    /// # use stygian_graph::adapters::sitemap::SitemapAdapter;
193    /// # use stygian_graph::ports::{ScrapingService, ServiceInput};
194    /// # use serde_json::json;
195    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
196    /// let adapter = SitemapAdapter::new(reqwest::Client::new(), 5);
197    /// let input = ServiceInput {
198    ///     url: "https://example.com/sitemap.xml".into(),
199    ///     params: json!({ "min_priority": 0.5 }),
200    /// };
201    /// let out = adapter.execute(input).await.unwrap();
202    /// # });
203    /// ```
204    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
205        let mut entries = self.resolve(&input.url, 0).await?;
206
207        // Apply optional filters
208        if let Some(min_pri) = input
209            .params
210            .get("min_priority")
211            .and_then(serde_json::Value::as_f64)
212        {
213            entries.retain(|e| e.priority.unwrap_or(0.0) >= min_pri);
214        }
215        if let Some(after) = input.params.get("lastmod_after").and_then(|v| v.as_str()) {
216            entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= after));
217        }
218        if let Some(before) = input.params.get("lastmod_before").and_then(|v| v.as_str()) {
219            entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm <= before));
220        }
221
222        let count = entries.len();
223        let data = serde_json::to_string(&entries).map_err(|e| {
224            StygianError::Service(ServiceError::InvalidResponse(format!(
225                "sitemap serialization failed: {e}"
226            )))
227        })?;
228
229        Ok(ServiceOutput {
230            data,
231            metadata: json!({
232                "source": "sitemap",
233                "url_count": count,
234                "source_url": input.url,
235            }),
236        })
237    }
238
239    fn name(&self) -> &'static str {
240        "sitemap"
241    }
242}
243
244// ─── XML parsing helpers ──────────────────────────────────────────────────────
245
/// Kind of sitemap document, identified by the local name of its root element.
#[derive(Debug, PartialEq)]
enum RootElement {
    /// A `<urlset>` document listing page URLs.
    UrlSet,
    /// A `<sitemapindex>` document listing further sitemaps.
    SitemapIndex,
}
251
252/// Detect whether the XML document is a `<urlset>` or `<sitemapindex>`.
253fn detect_root_element(xml: &str) -> Result<RootElement> {
254    let mut reader = Reader::from_str(xml);
255    let mut buf = Vec::new();
256
257    loop {
258        match reader.read_event_into(&mut buf) {
259            Ok(Event::Start(ref e) | Event::Empty(ref e)) => {
260                let local = e.local_name();
261                let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
262                return match name {
263                    "urlset" => Ok(RootElement::UrlSet),
264                    "sitemapindex" => Ok(RootElement::SitemapIndex),
265                    _ => Err(StygianError::Service(ServiceError::InvalidResponse(
266                        format!("unexpected XML root element: <{name}>"),
267                    ))),
268                };
269            }
270            Ok(Event::Eof) => {
271                return Err(StygianError::Service(ServiceError::InvalidResponse(
272                    "empty or invalid XML document".into(),
273                )));
274            }
275            Err(e) => {
276                return Err(StygianError::Service(ServiceError::InvalidResponse(
277                    format!("XML parse error: {e}"),
278                )));
279            }
280            _ => {} // skip processing instructions, comments, decl
281        }
282        buf.clear();
283    }
284}
285
286/// Parse a `<urlset>` document into a list of [`SitemapEntry`].
287fn parse_urlset(xml: &str) -> Result<Vec<SitemapEntry>> {
288    let mut reader = Reader::from_str(xml);
289    let mut buf = Vec::new();
290    let mut entries = Vec::new();
291
292    // Current entry being built
293    let mut current: Option<SitemapEntryBuilder> = None;
294    let mut current_tag: Option<String> = None;
295
296    loop {
297        match reader.read_event_into(&mut buf) {
298            Ok(Event::Start(ref e)) => {
299                let name = local_name(e);
300                match name.as_str() {
301                    "url" => {
302                        current = Some(SitemapEntryBuilder::default());
303                    }
304                    "loc" | "lastmod" | "changefreq" | "priority" => {
305                        current_tag = Some(name);
306                    }
307                    _ => {}
308                }
309            }
310            Ok(Event::Text(ref t)) => {
311                if let (Some(builder), Some(tag)) = (&mut current, &current_tag) {
312                    let text = t.unescape().unwrap_or_default().trim().to_string();
313                    if !text.is_empty() {
314                        match tag.as_str() {
315                            "loc" => builder.loc = Some(text),
316                            "lastmod" => builder.lastmod = Some(text),
317                            "changefreq" => builder.changefreq = Some(text),
318                            "priority" => builder.priority = text.parse().ok(),
319                            _ => {}
320                        }
321                    }
322                }
323            }
324            Ok(Event::End(ref e)) => {
325                let name = local_name_end(e);
326                if name == "url"
327                    && let Some(builder) = current.take()
328                    && let Some(entry) = builder.build()
329                {
330                    entries.push(entry);
331                }
332                if current_tag.as_deref() == Some(&name) {
333                    current_tag = None;
334                }
335            }
336            Ok(Event::Eof) => break,
337            Err(e) => {
338                return Err(StygianError::Service(ServiceError::InvalidResponse(
339                    format!("sitemap XML parse error: {e}"),
340                )));
341            }
342            _ => {}
343        }
344        buf.clear();
345    }
346
347    Ok(entries)
348}
349
350/// Parse a `<sitemapindex>` document, returning the `<loc>` URLs of nested sitemaps.
351fn parse_sitemapindex(xml: &str) -> Result<Vec<String>> {
352    let mut reader = Reader::from_str(xml);
353    let mut buf = Vec::new();
354    let mut urls = Vec::new();
355    let mut in_sitemap = false;
356    let mut in_loc = false;
357
358    loop {
359        match reader.read_event_into(&mut buf) {
360            Ok(Event::Start(ref e)) => {
361                let name = local_name(e);
362                match name.as_str() {
363                    "sitemap" => in_sitemap = true,
364                    "loc" if in_sitemap => in_loc = true,
365                    _ => {}
366                }
367            }
368            Ok(Event::Text(ref t)) => {
369                if in_loc {
370                    let text = t.unescape().unwrap_or_default().trim().to_string();
371                    if !text.is_empty() {
372                        urls.push(text);
373                    }
374                }
375            }
376            Ok(Event::End(ref e)) => {
377                let name = local_name_end(e);
378                match name.as_str() {
379                    "sitemap" => {
380                        in_sitemap = false;
381                        in_loc = false;
382                    }
383                    "loc" => in_loc = false,
384                    _ => {}
385                }
386            }
387            Ok(Event::Eof) => break,
388            Err(e) => {
389                return Err(StygianError::Service(ServiceError::InvalidResponse(
390                    format!("sitemapindex XML parse error: {e}"),
391                )));
392            }
393            _ => {}
394        }
395        buf.clear();
396    }
397
398    Ok(urls)
399}
400
401/// Extract the local name (without namespace prefix) from a start element.
402fn local_name(e: &quick_xml::events::BytesStart<'_>) -> String {
403    std::str::from_utf8(e.local_name().as_ref())
404        .unwrap_or("")
405        .to_string()
406}
407
408/// Extract the local name from an end element.
409fn local_name_end(e: &quick_xml::events::BytesEnd<'_>) -> String {
410    std::str::from_utf8(e.local_name().as_ref())
411        .unwrap_or("")
412        .to_string()
413}
414
415// ─── Builder ──────────────────────────────────────────────────────────────────
416
/// Accumulator for the fields of one `<url>` element while it is being parsed.
///
/// Every field starts as `None` (via `Default`) and is filled in as the
/// matching child element is encountered.
#[derive(Default)]
struct SitemapEntryBuilder {
    /// Text of `<loc>`; required for [`SitemapEntryBuilder::build`] to succeed.
    loc: Option<String>,
    /// Text of `<lastmod>`, if seen.
    lastmod: Option<String>,
    /// Text of `<changefreq>`, if seen.
    changefreq: Option<String>,
    /// Parsed value of `<priority>`, if seen and numeric.
    priority: Option<f64>,
}
424
425impl SitemapEntryBuilder {
426    fn build(self) -> Option<SitemapEntry> {
427        Some(SitemapEntry {
428            loc: self.loc?,
429            lastmod: self.lastmod,
430            changefreq: self.changefreq,
431            priority: self.priority,
432        })
433    }
434}
435
436// ─── Tests ────────────────────────────────────────────────────────────────────
437
/// Unit tests for the XML parsing helpers and the filter predicates.
///
/// None of these tests perform network I/O; they exercise the pure parsing
/// functions on inline fixtures.
#[cfg(test)]
mod tests {
    use super::*;

    /// Three-entry `<urlset>` fixture: one fully populated entry, one without
    /// `<changefreq>`, and one with only `<loc>`.
    const URLSET_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page1</loc>
    <lastmod>2026-03-01</lastmod>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/page2</loc>
    <lastmod>2026-02-15</lastmod>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/page3</loc>
  </url>
</urlset>"#;

    /// `<sitemapindex>` fixture referencing one plain and one `.gz` sitemap.
    const SITEMAPINDEX_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap1.xml</loc>
    <lastmod>2026-03-01</lastmod>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap2.xml.gz</loc>
  </sitemap>
</sitemapindex>"#;

    /// All three entries are returned with the expected mix of present and
    /// absent optional fields.
    #[test]
    fn parse_urlset_with_3_urls() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let entries = parse_urlset(URLSET_XML)?;
        assert_eq!(entries.len(), 3);

        let first = entries.first().ok_or("missing first entry")?;
        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2026-03-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        let second = entries.get(1).ok_or("missing second entry")?;
        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.priority, Some(0.5));
        assert!(second.changefreq.is_none());

        let third = entries.get(2).ok_or("missing third entry")?;
        assert_eq!(third.loc, "https://example.com/page3");
        assert!(third.lastmod.is_none());
        assert!(third.priority.is_none());

        Ok(())
    }

    /// Nested sitemap URLs come back in document order; the `<lastmod>` of a
    /// `<sitemap>` entry is not reported.
    #[test]
    fn parse_sitemapindex_extracts_nested_urls()
    -> std::result::Result<(), Box<dyn std::error::Error>> {
        let urls = parse_sitemapindex(SITEMAPINDEX_XML)?;
        assert_eq!(urls.len(), 2);
        assert_eq!(
            urls.first().map(String::as_str),
            Some("https://example.com/sitemap1.xml")
        );
        assert_eq!(
            urls.get(1).map(String::as_str),
            Some("https://example.com/sitemap2.xml.gz")
        );
        Ok(())
    }

    /// A `<urlset>` root is recognised despite the leading XML declaration.
    #[test]
    fn detect_root_urlset() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let root = detect_root_element(URLSET_XML)?;
        assert_eq!(root, RootElement::UrlSet);
        Ok(())
    }

    /// A `<sitemapindex>` root is recognised despite the leading XML declaration.
    #[test]
    fn detect_root_sitemapindex() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let root = detect_root_element(SITEMAPINDEX_XML)?;
        assert_eq!(root, RootElement::SitemapIndex);
        Ok(())
    }

    /// Applies a `lastmod >= bound` predicate directly to parsed entries;
    /// entries without `lastmod` are excluded.
    #[test]
    fn filter_by_lastmod_range() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let mut entries = parse_urlset(URLSET_XML)?;
        // Only entries on or after 2026-03-01
        entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= "2026-03-01"));
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/page1")
        );
        Ok(())
    }

    /// Applies a priority-threshold predicate directly to parsed entries;
    /// only the 0.8-priority entry clears 0.6.
    #[test]
    fn filter_by_priority_threshold() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let mut entries = parse_urlset(URLSET_XML)?;
        entries.retain(|e| e.priority.unwrap_or(0.0) >= 0.6);
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/page1")
        );
        Ok(())
    }

    /// Round-trips the urlset fixture through gzip and parses the result,
    /// using the same decoder type as the adapter's fetch path.
    #[test]
    fn gzip_decompression() -> std::result::Result<(), Box<dyn std::error::Error>> {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        let xml = URLSET_XML;
        let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(xml.as_bytes())?;
        let compressed = encoder.finish()?;

        // Decompress and parse
        let mut decoder = GzDecoder::new(&compressed[..]);
        let mut decompressed = String::new();
        decoder.read_to_string(&mut decompressed)?;

        let entries = parse_urlset(&decompressed)?;
        assert_eq!(entries.len(), 3);
        Ok(())
    }

    /// The first element here is `<not-a-sitemap>`, which is not a sitemap
    /// root, so detection fails.
    #[test]
    fn malformed_xml_returns_error() {
        let bad = "<not-a-sitemap><broken";
        let result = detect_root_element(bad);
        assert!(result.is_err());
    }

    /// An empty document reaches end-of-input before any element is seen.
    #[test]
    fn empty_xml_returns_error() {
        let result = detect_root_element("");
        assert!(result.is_err());
    }

    /// A well-formed document with a non-sitemap root (`<html>`) is rejected.
    #[test]
    fn unexpected_root_element_returns_error() {
        let xml = r#"<?xml version="1.0"?><html><body>nope</body></html>"#;
        let result = detect_root_element(xml);
        assert!(result.is_err());
    }

    /// An empty `<urlset>` parses successfully to zero entries.
    #[test]
    fn urlset_with_no_urls_returns_empty() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let xml = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        let entries = parse_urlset(xml)?;
        assert!(entries.is_empty());
        Ok(())
    }

    /// A `<url>` lacking `<loc>` is dropped without failing the whole parse.
    #[test]
    fn url_without_loc_is_skipped() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2026-01-01</lastmod>
  </url>
  <url>
    <loc>https://example.com/valid</loc>
  </url>
</urlset>"#;
        let entries = parse_urlset(xml)?;
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/valid")
        );
        Ok(())
    }
}