1use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
32use async_trait::async_trait;
33use flate2::read::GzDecoder;
34use quick_xml::Reader;
35use quick_xml::events::Event;
36use serde::{Deserialize, Serialize};
37use serde_json::json;
38use std::io::Read;
39
/// One `<url>` record extracted from a sitemap `<urlset>` document.
///
/// Values are stored as they appear in the XML (trimmed); only `priority`
/// is converted to a number.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SitemapEntry {
    /// Text of the `<loc>` element. Required: a `<url>` without it is skipped.
    pub loc: String,
    /// Text of the `<lastmod>` element, kept as a raw string (not parsed as a date).
    pub lastmod: Option<String>,
    /// Text of the `<changefreq>` element, kept as a raw string.
    pub changefreq: Option<String>,
    /// `<priority>` parsed as `f64`; `None` when absent or not a valid number.
    pub priority: Option<f64>,
}
67
/// Scraping service that downloads a sitemap (or sitemap index) and returns
/// the URL entries it lists.
pub struct SitemapAdapter {
    /// HTTP client used for every sitemap request.
    client: reqwest::Client,
    /// Deepest sitemap-index nesting level that will be followed
    /// (the starting URL is depth 0).
    max_depth: usize,
}
86
87impl SitemapAdapter {
88 #[must_use]
100 pub const fn new(client: reqwest::Client, max_depth: usize) -> Self {
101 Self { client, max_depth }
102 }
103
104 async fn fetch_bytes(&self, url: &str) -> Result<String> {
110 let resp = self.client.get(url).send().await.map_err(|e| {
111 StygianError::Service(ServiceError::Unavailable(format!(
112 "sitemap fetch failed: {e}"
113 )))
114 })?;
115
116 if !resp.status().is_success() {
117 return Err(StygianError::Service(ServiceError::InvalidResponse(
118 format!("sitemap returned HTTP {}", resp.status()),
119 )));
120 }
121
122 let bytes = resp.bytes().await.map_err(|e| {
123 StygianError::Service(ServiceError::Unavailable(format!(
124 "sitemap body read failed: {e}"
125 )))
126 })?;
127
128 if url.to_ascii_lowercase().ends_with(".gz") || bytes.starts_with(&[0x1f, 0x8b]) {
130 let mut decoder = GzDecoder::new(&bytes[..]);
131 let mut xml = String::new();
132 decoder.read_to_string(&mut xml).map_err(|e| {
133 StygianError::Service(ServiceError::InvalidResponse(format!(
134 "gzip decompression failed: {e}"
135 )))
136 })?;
137 Ok(xml)
138 } else {
139 String::from_utf8(bytes.to_vec()).map_err(|e| {
140 StygianError::Service(ServiceError::InvalidResponse(format!(
141 "sitemap not valid UTF-8: {e}"
142 )))
143 })
144 }
145 }
146
147 async fn resolve(&self, url: &str, depth: usize) -> Result<Vec<SitemapEntry>> {
153 if depth > self.max_depth {
154 return Err(StygianError::Service(ServiceError::InvalidResponse(
155 format!(
156 "sitemap index nesting exceeded max depth ({depth} > {})",
157 self.max_depth
158 ),
159 )));
160 }
161
162 let xml = self.fetch_bytes(url).await?;
163 let root_kind = detect_root_element(&xml)?;
164
165 match root_kind {
166 RootElement::UrlSet => parse_urlset(&xml),
167 RootElement::SitemapIndex => {
168 let nested_urls = parse_sitemapindex(&xml)?;
169 let mut all = Vec::new();
170 for nested_url in &nested_urls {
171 let entries = Box::pin(self.resolve(nested_url, depth + 1)).await?;
172 all.extend(entries);
173 }
174 Ok(all)
175 }
176 }
177 }
178}
179
180#[async_trait]
181impl ScrapingService for SitemapAdapter {
182 async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
206 let mut entries = self.resolve(&input.url, 0).await?;
207
208 if let Some(min_pri) = input
210 .params
211 .get("min_priority")
212 .and_then(serde_json::Value::as_f64)
213 {
214 entries.retain(|e| e.priority.unwrap_or(0.0) >= min_pri);
215 }
216 if let Some(after) = input.params.get("lastmod_after").and_then(|v| v.as_str()) {
217 entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= after));
218 }
219 if let Some(before) = input.params.get("lastmod_before").and_then(|v| v.as_str()) {
220 entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm <= before));
221 }
222
223 let count = entries.len();
224 let data = serde_json::to_string(&entries).map_err(|e| {
225 StygianError::Service(ServiceError::InvalidResponse(format!(
226 "sitemap serialization failed: {e}"
227 )))
228 })?;
229
230 Ok(ServiceOutput {
231 data,
232 metadata: json!({
233 "source": "sitemap",
234 "url_count": count,
235 "source_url": input.url,
236 }),
237 })
238 }
239
240 fn name(&self) -> &'static str {
241 "sitemap"
242 }
243}
244
/// Kind of sitemap document, determined by its root XML element.
#[derive(Debug, PartialEq)]
enum RootElement {
    /// Root is `<urlset>`: the document lists page URLs directly.
    UrlSet,
    /// Root is `<sitemapindex>`: the document lists further sitemap URLs.
    SitemapIndex,
}
252
253fn detect_root_element(xml: &str) -> Result<RootElement> {
255 let mut reader = Reader::from_str(xml);
256 let mut buf = Vec::new();
257
258 loop {
259 match reader.read_event_into(&mut buf) {
260 Ok(Event::Start(ref e) | Event::Empty(ref e)) => {
261 let local = e.local_name();
262 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
263 return match name {
264 "urlset" => Ok(RootElement::UrlSet),
265 "sitemapindex" => Ok(RootElement::SitemapIndex),
266 _ => Err(StygianError::Service(ServiceError::InvalidResponse(
267 format!("unexpected XML root element: <{name}>"),
268 ))),
269 };
270 }
271 Ok(Event::Eof) => {
272 return Err(StygianError::Service(ServiceError::InvalidResponse(
273 "empty or invalid XML document".into(),
274 )));
275 }
276 Err(e) => {
277 return Err(StygianError::Service(ServiceError::InvalidResponse(
278 format!("XML parse error: {e}"),
279 )));
280 }
281 _ => {} }
283 buf.clear();
284 }
285}
286
287fn parse_urlset(xml: &str) -> Result<Vec<SitemapEntry>> {
289 let mut reader = Reader::from_str(xml);
290 let mut buf = Vec::new();
291 let mut entries = Vec::new();
292
293 let mut current: Option<SitemapEntryBuilder> = None;
295 let mut current_tag: Option<String> = None;
296
297 loop {
298 match reader.read_event_into(&mut buf) {
299 Ok(Event::Start(ref e)) => {
300 let name = local_name(e);
301 match name.as_str() {
302 "url" => {
303 current = Some(SitemapEntryBuilder::default());
304 }
305 "loc" | "lastmod" | "changefreq" | "priority" => {
306 current_tag = Some(name);
307 }
308 _ => {}
309 }
310 }
311 Ok(Event::Text(ref t)) => {
312 if let (Some(builder), Some(tag)) = (&mut current, ¤t_tag) {
313 let text = t.xml10_content().unwrap_or_default().trim().to_string();
314 if !text.is_empty() {
315 match tag.as_str() {
316 "loc" => builder.loc = Some(text),
317 "lastmod" => builder.lastmod = Some(text),
318 "changefreq" => builder.changefreq = Some(text),
319 "priority" => builder.priority = text.parse().ok(),
320 _ => {}
321 }
322 }
323 }
324 }
325 Ok(Event::End(ref e)) => {
326 let name = local_name_end(e);
327 if name == "url"
328 && let Some(builder) = current.take()
329 && let Some(entry) = builder.build()
330 {
331 entries.push(entry);
332 }
333 if current_tag.as_deref() == Some(&name) {
334 current_tag = None;
335 }
336 }
337 Ok(Event::Eof) => break,
338 Err(e) => {
339 return Err(StygianError::Service(ServiceError::InvalidResponse(
340 format!("sitemap XML parse error: {e}"),
341 )));
342 }
343 _ => {}
344 }
345 buf.clear();
346 }
347
348 Ok(entries)
349}
350
351fn parse_sitemapindex(xml: &str) -> Result<Vec<String>> {
353 let mut reader = Reader::from_str(xml);
354 let mut buf = Vec::new();
355 let mut urls = Vec::new();
356 let mut in_sitemap = false;
357 let mut in_loc = false;
358
359 loop {
360 match reader.read_event_into(&mut buf) {
361 Ok(Event::Start(ref e)) => {
362 let name = local_name(e);
363 match name.as_str() {
364 "sitemap" => in_sitemap = true,
365 "loc" if in_sitemap => in_loc = true,
366 _ => {}
367 }
368 }
369 Ok(Event::Text(ref t)) if in_loc => {
370 let text = t.xml10_content().unwrap_or_default().trim().to_string();
371 if !text.is_empty() {
372 urls.push(text);
373 }
374 }
375 Ok(Event::End(ref e)) => {
376 let name = local_name_end(e);
377 match name.as_str() {
378 "sitemap" => {
379 in_sitemap = false;
380 in_loc = false;
381 }
382 "loc" => in_loc = false,
383 _ => {}
384 }
385 }
386 Ok(Event::Eof) => break,
387 Err(e) => {
388 return Err(StygianError::Service(ServiceError::InvalidResponse(
389 format!("sitemapindex XML parse error: {e}"),
390 )));
391 }
392 _ => {}
393 }
394 buf.clear();
395 }
396
397 Ok(urls)
398}
399
400fn local_name(e: &quick_xml::events::BytesStart<'_>) -> String {
402 std::str::from_utf8(e.local_name().as_ref())
403 .unwrap_or("")
404 .to_string()
405}
406
407fn local_name_end(e: &quick_xml::events::BytesEnd<'_>) -> String {
409 std::str::from_utf8(e.local_name().as_ref())
410 .unwrap_or("")
411 .to_string()
412}
413
/// Accumulates the fields of one `<url>` element while it is being parsed.
///
/// Every field starts as `None`; `build` turns the builder into a
/// `SitemapEntry` only if `loc` was set.
#[derive(Default)]
struct SitemapEntryBuilder {
    /// Value of `<loc>`, if one has been seen.
    loc: Option<String>,
    /// Value of `<lastmod>`, if one has been seen.
    lastmod: Option<String>,
    /// Value of `<changefreq>`, if one has been seen.
    changefreq: Option<String>,
    /// Parsed value of `<priority>`, if one has been seen and was numeric.
    priority: Option<f64>,
}
423
424impl SitemapEntryBuilder {
425 fn build(self) -> Option<SitemapEntry> {
426 Some(SitemapEntry {
427 loc: self.loc?,
428 lastmod: self.lastmod,
429 changefreq: self.changefreq,
430 priority: self.priority,
431 })
432 }
433}
434
// Unit tests for the pure parsing helpers. Nothing here performs network I/O.
#[cfg(test)]
mod tests {
    use super::*;

    // Fixture: a urlset with three entries — one fully populated, one without
    // `changefreq`, and one carrying only `loc`.
    const URLSET_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <url>
 <loc>https://example.com/page1</loc>
 <lastmod>2026-03-01</lastmod>
 <changefreq>daily</changefreq>
 <priority>0.8</priority>
 </url>
 <url>
 <loc>https://example.com/page2</loc>
 <lastmod>2026-02-15</lastmod>
 <priority>0.5</priority>
 </url>
 <url>
 <loc>https://example.com/page3</loc>
 </url>
</urlset>"#;

    // Fixture: a sitemap index pointing at one plain and one `.gz` sitemap.
    const SITEMAPINDEX_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <sitemap>
 <loc>https://example.com/sitemap1.xml</loc>
 <lastmod>2026-03-01</lastmod>
 </sitemap>
 <sitemap>
 <loc>https://example.com/sitemap2.xml.gz</loc>
 </sitemap>
</sitemapindex>"#;

    // All three entries are returned in document order, with optional fields
    // populated only where the fixture provides them.
    #[test]
    fn parse_urlset_with_3_urls() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let entries = parse_urlset(URLSET_XML)?;
        assert_eq!(entries.len(), 3);

        let first = entries.first().ok_or("missing first entry")?;
        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2026-03-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        let second = entries.get(1).ok_or("missing second entry")?;
        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.priority, Some(0.5));
        assert!(second.changefreq.is_none());

        let third = entries.get(2).ok_or("missing third entry")?;
        assert_eq!(third.loc, "https://example.com/page3");
        assert!(third.lastmod.is_none());
        assert!(third.priority.is_none());

        Ok(())
    }

    // Only the `<loc>` of each `<sitemap>` is extracted; `<lastmod>` is ignored.
    #[test]
    fn parse_sitemapindex_extracts_nested_urls()
    -> std::result::Result<(), Box<dyn std::error::Error>> {
        let urls = parse_sitemapindex(SITEMAPINDEX_XML)?;
        assert_eq!(urls.len(), 2);
        assert_eq!(
            urls.first().map(String::as_str),
            Some("https://example.com/sitemap1.xml")
        );
        assert_eq!(
            urls.get(1).map(String::as_str),
            Some("https://example.com/sitemap2.xml.gz")
        );
        Ok(())
    }

    // The XML declaration is skipped and `<urlset>` is recognised as the root.
    #[test]
    fn detect_root_urlset() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let root = detect_root_element(URLSET_XML)?;
        assert_eq!(root, RootElement::UrlSet);
        Ok(())
    }

    // Same as above for a `<sitemapindex>` root.
    #[test]
    fn detect_root_sitemapindex() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let root = detect_root_element(SITEMAPINDEX_XML)?;
        assert_eq!(root, RootElement::SitemapIndex);
        Ok(())
    }

    // Mirrors the `lastmod_after` filter used by `execute`: a lower bound
    // compared as a plain string. Entries without `lastmod` are dropped.
    #[test]
    fn filter_by_lastmod_range() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let mut entries = parse_urlset(URLSET_XML)?;
        entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= "2026-03-01"));
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/page1")
        );
        Ok(())
    }

    // Mirrors the `min_priority` filter used by `execute`: a missing priority
    // counts as 0.0, so only the 0.8 entry passes a 0.6 threshold.
    #[test]
    fn filter_by_priority_threshold() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let mut entries = parse_urlset(URLSET_XML)?;
        entries.retain(|e| e.priority.unwrap_or(0.0) >= 0.6);
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/page1")
        );
        Ok(())
    }

    // Round-trips the urlset fixture through gzip and checks the inflated
    // text still parses. This exercises flate2 plus `parse_urlset`; it does
    // not call the adapter's own fetch path.
    #[test]
    fn gzip_decompression() -> std::result::Result<(), Box<dyn std::error::Error>> {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        let xml = URLSET_XML;
        let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(xml.as_bytes())?;
        let compressed = encoder.finish()?;

        let mut decoder = GzDecoder::new(&compressed[..]);
        let mut decompressed = String::new();
        decoder.read_to_string(&mut decompressed)?;

        let entries = parse_urlset(&decompressed)?;
        assert_eq!(entries.len(), 3);
        Ok(())
    }

    // NOTE(review): the first tag here is `<not-a-sitemap>`, so this is
    // presumably rejected as an unexpected root before the truncated
    // `<broken` is ever read — confirm if a true syntax-error case is wanted.
    #[test]
    fn malformed_xml_returns_error() {
        let bad = "<not-a-sitemap><broken";
        let result = detect_root_element(bad);
        assert!(result.is_err());
    }

    // Empty input contains no element, so root detection must fail.
    #[test]
    fn empty_xml_returns_error() {
        let result = detect_root_element("");
        assert!(result.is_err());
    }

    // A well-formed document whose root is not a sitemap element is rejected.
    #[test]
    fn unexpected_root_element_returns_error() {
        let xml = r#"<?xml version="1.0"?><html><body>nope</body></html>"#;
        let result = detect_root_element(xml);
        assert!(result.is_err());
    }

    // An empty `<urlset>` is valid and yields no entries.
    #[test]
    fn urlset_with_no_urls_returns_empty() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let xml = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        let entries = parse_urlset(xml)?;
        assert!(entries.is_empty());
        Ok(())
    }

    // A `<url>` lacking `<loc>` is dropped without affecting its siblings.
    #[test]
    fn url_without_loc_is_skipped() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <url>
 <lastmod>2026-01-01</lastmod>
 </url>
 <url>
 <loc>https://example.com/valid</loc>
 </url>
</urlset>"#;
        let entries = parse_urlset(xml)?;
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries.first().map(|entry| entry.loc.as_str()),
            Some("https://example.com/valid")
        );
        Ok(())
    }
}