1use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
32use async_trait::async_trait;
33use flate2::read::GzDecoder;
34use quick_xml::Reader;
35use quick_xml::events::Event;
36use serde::{Deserialize, Serialize};
37use serde_json::json;
38use std::io::Read;
39
40#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
57pub struct SitemapEntry {
58 pub loc: String,
60 pub lastmod: Option<String>,
62 pub changefreq: Option<String>,
64 pub priority: Option<f64>,
66}
67
68pub struct SitemapAdapter {
83 client: reqwest::Client,
84 max_depth: usize,
85}
86
87impl SitemapAdapter {
88 pub const fn new(client: reqwest::Client, max_depth: usize) -> Self {
100 Self { client, max_depth }
101 }
102
103 async fn fetch_bytes(&self, url: &str) -> Result<String> {
109 let resp = self.client.get(url).send().await.map_err(|e| {
110 StygianError::Service(ServiceError::Unavailable(format!(
111 "sitemap fetch failed: {e}"
112 )))
113 })?;
114
115 if !resp.status().is_success() {
116 return Err(StygianError::Service(ServiceError::InvalidResponse(
117 format!("sitemap returned HTTP {}", resp.status()),
118 )));
119 }
120
121 let bytes = resp.bytes().await.map_err(|e| {
122 StygianError::Service(ServiceError::Unavailable(format!(
123 "sitemap body read failed: {e}"
124 )))
125 })?;
126
127 if url.to_ascii_lowercase().ends_with(".gz") || bytes.starts_with(&[0x1f, 0x8b]) {
129 let mut decoder = GzDecoder::new(&bytes[..]);
130 let mut xml = String::new();
131 decoder.read_to_string(&mut xml).map_err(|e| {
132 StygianError::Service(ServiceError::InvalidResponse(format!(
133 "gzip decompression failed: {e}"
134 )))
135 })?;
136 Ok(xml)
137 } else {
138 String::from_utf8(bytes.to_vec()).map_err(|e| {
139 StygianError::Service(ServiceError::InvalidResponse(format!(
140 "sitemap not valid UTF-8: {e}"
141 )))
142 })
143 }
144 }
145
146 async fn resolve(&self, url: &str, depth: usize) -> Result<Vec<SitemapEntry>> {
152 if depth > self.max_depth {
153 return Err(StygianError::Service(ServiceError::InvalidResponse(
154 format!(
155 "sitemap index nesting exceeded max depth ({depth} > {})",
156 self.max_depth
157 ),
158 )));
159 }
160
161 let xml = self.fetch_bytes(url).await?;
162 let root_kind = detect_root_element(&xml)?;
163
164 match root_kind {
165 RootElement::UrlSet => parse_urlset(&xml),
166 RootElement::SitemapIndex => {
167 let nested_urls = parse_sitemapindex(&xml)?;
168 let mut all = Vec::new();
169 for nested_url in &nested_urls {
170 let entries = Box::pin(self.resolve(nested_url, depth + 1)).await?;
171 all.extend(entries);
172 }
173 Ok(all)
174 }
175 }
176 }
177}
178
179#[async_trait]
180impl ScrapingService for SitemapAdapter {
181 async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
205 let mut entries = self.resolve(&input.url, 0).await?;
206
207 if let Some(min_pri) = input
209 .params
210 .get("min_priority")
211 .and_then(serde_json::Value::as_f64)
212 {
213 entries.retain(|e| e.priority.unwrap_or(0.0) >= min_pri);
214 }
215 if let Some(after) = input.params.get("lastmod_after").and_then(|v| v.as_str()) {
216 entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= after));
217 }
218 if let Some(before) = input.params.get("lastmod_before").and_then(|v| v.as_str()) {
219 entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm <= before));
220 }
221
222 let count = entries.len();
223 let data = serde_json::to_string(&entries).map_err(|e| {
224 StygianError::Service(ServiceError::InvalidResponse(format!(
225 "sitemap serialization failed: {e}"
226 )))
227 })?;
228
229 Ok(ServiceOutput {
230 data,
231 metadata: json!({
232 "source": "sitemap",
233 "url_count": count,
234 "source_url": input.url,
235 }),
236 })
237 }
238
239 fn name(&self) -> &'static str {
240 "sitemap"
241 }
242}
243
/// The two document kinds the adapter understands, identified by the local
/// name of the XML root element.
#[derive(Debug, PartialEq)]
enum RootElement {
    /// Root element `<urlset>`: a list of page URLs.
    UrlSet,
    /// Root element `<sitemapindex>`: a list of further sitemap documents.
    SitemapIndex,
}
251
252fn detect_root_element(xml: &str) -> Result<RootElement> {
254 let mut reader = Reader::from_str(xml);
255 let mut buf = Vec::new();
256
257 loop {
258 match reader.read_event_into(&mut buf) {
259 Ok(Event::Start(ref e) | Event::Empty(ref e)) => {
260 let local = e.local_name();
261 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
262 return match name {
263 "urlset" => Ok(RootElement::UrlSet),
264 "sitemapindex" => Ok(RootElement::SitemapIndex),
265 _ => Err(StygianError::Service(ServiceError::InvalidResponse(
266 format!("unexpected XML root element: <{name}>"),
267 ))),
268 };
269 }
270 Ok(Event::Eof) => {
271 return Err(StygianError::Service(ServiceError::InvalidResponse(
272 "empty or invalid XML document".into(),
273 )));
274 }
275 Err(e) => {
276 return Err(StygianError::Service(ServiceError::InvalidResponse(
277 format!("XML parse error: {e}"),
278 )));
279 }
280 _ => {} }
282 buf.clear();
283 }
284}
285
286fn parse_urlset(xml: &str) -> Result<Vec<SitemapEntry>> {
288 let mut reader = Reader::from_str(xml);
289 let mut buf = Vec::new();
290 let mut entries = Vec::new();
291
292 let mut current: Option<SitemapEntryBuilder> = None;
294 let mut current_tag: Option<String> = None;
295
296 loop {
297 match reader.read_event_into(&mut buf) {
298 Ok(Event::Start(ref e)) => {
299 let name = local_name(e);
300 match name.as_str() {
301 "url" => {
302 current = Some(SitemapEntryBuilder::default());
303 }
304 "loc" | "lastmod" | "changefreq" | "priority" => {
305 current_tag = Some(name);
306 }
307 _ => {}
308 }
309 }
310 Ok(Event::Text(ref t)) => {
311 if let (Some(builder), Some(tag)) = (&mut current, ¤t_tag) {
312 let text = t.unescape().unwrap_or_default().trim().to_string();
313 if !text.is_empty() {
314 match tag.as_str() {
315 "loc" => builder.loc = Some(text),
316 "lastmod" => builder.lastmod = Some(text),
317 "changefreq" => builder.changefreq = Some(text),
318 "priority" => builder.priority = text.parse().ok(),
319 _ => {}
320 }
321 }
322 }
323 }
324 Ok(Event::End(ref e)) => {
325 let name = local_name_end(e);
326 if name == "url"
327 && let Some(builder) = current.take()
328 && let Some(entry) = builder.build()
329 {
330 entries.push(entry);
331 }
332 if current_tag.as_deref() == Some(&name) {
333 current_tag = None;
334 }
335 }
336 Ok(Event::Eof) => break,
337 Err(e) => {
338 return Err(StygianError::Service(ServiceError::InvalidResponse(
339 format!("sitemap XML parse error: {e}"),
340 )));
341 }
342 _ => {}
343 }
344 buf.clear();
345 }
346
347 Ok(entries)
348}
349
350fn parse_sitemapindex(xml: &str) -> Result<Vec<String>> {
352 let mut reader = Reader::from_str(xml);
353 let mut buf = Vec::new();
354 let mut urls = Vec::new();
355 let mut in_sitemap = false;
356 let mut in_loc = false;
357
358 loop {
359 match reader.read_event_into(&mut buf) {
360 Ok(Event::Start(ref e)) => {
361 let name = local_name(e);
362 match name.as_str() {
363 "sitemap" => in_sitemap = true,
364 "loc" if in_sitemap => in_loc = true,
365 _ => {}
366 }
367 }
368 Ok(Event::Text(ref t)) => {
369 if in_loc {
370 let text = t.unescape().unwrap_or_default().trim().to_string();
371 if !text.is_empty() {
372 urls.push(text);
373 }
374 }
375 }
376 Ok(Event::End(ref e)) => {
377 let name = local_name_end(e);
378 match name.as_str() {
379 "sitemap" => {
380 in_sitemap = false;
381 in_loc = false;
382 }
383 "loc" => in_loc = false,
384 _ => {}
385 }
386 }
387 Ok(Event::Eof) => break,
388 Err(e) => {
389 return Err(StygianError::Service(ServiceError::InvalidResponse(
390 format!("sitemapindex XML parse error: {e}"),
391 )));
392 }
393 _ => {}
394 }
395 buf.clear();
396 }
397
398 Ok(urls)
399}
400
401fn local_name(e: &quick_xml::events::BytesStart<'_>) -> String {
403 std::str::from_utf8(e.local_name().as_ref())
404 .unwrap_or("")
405 .to_string()
406}
407
408fn local_name_end(e: &quick_xml::events::BytesEnd<'_>) -> String {
410 std::str::from_utf8(e.local_name().as_ref())
411 .unwrap_or("")
412 .to_string()
413}
414
/// Accumulates the fields of one `<url>` element while it is being parsed.
///
/// Every field starts out as `None` and is filled in as the matching child
/// element is encountered.
#[derive(Default)]
struct SitemapEntryBuilder {
    /// Page URL; required for the entry to be produced.
    loc: Option<String>,
    /// Last-modification text, if seen.
    lastmod: Option<String>,
    /// Change-frequency text, if seen.
    changefreq: Option<String>,
    /// Parsed priority, if seen and numeric.
    priority: Option<f64>,
}
424
425impl SitemapEntryBuilder {
426 fn build(self) -> Option<SitemapEntry> {
427 Some(SitemapEntry {
428 loc: self.loc?,
429 lastmod: self.lastmod,
430 changefreq: self.changefreq,
431 priority: self.priority,
432 })
433 }
434}
435
#[cfg(test)]
mod tests {
    use super::*;

    /// Return type shared by the fallible tests in this module.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    /// A `<urlset>` with three pages: fully populated, partially populated,
    /// and `loc` only.
    const URLSET_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <url>
 <loc>https://example.com/page1</loc>
 <lastmod>2026-03-01</lastmod>
 <changefreq>daily</changefreq>
 <priority>0.8</priority>
 </url>
 <url>
 <loc>https://example.com/page2</loc>
 <lastmod>2026-02-15</lastmod>
 <priority>0.5</priority>
 </url>
 <url>
 <loc>https://example.com/page3</loc>
 </url>
</urlset>"#;

    /// A `<sitemapindex>` that references one plain and one gzip sitemap.
    const SITEMAPINDEX_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <sitemap>
 <loc>https://example.com/sitemap1.xml</loc>
 <lastmod>2026-03-01</lastmod>
 </sitemap>
 <sitemap>
 <loc>https://example.com/sitemap2.xml.gz</loc>
 </sitemap>
</sitemapindex>"#;

    /// All three entries are returned with the expected field values.
    #[test]
    fn parse_urlset_with_3_urls() -> TestResult {
        let entries = parse_urlset(URLSET_XML)?;
        let [first, second, third] = entries.as_slice() else {
            return Err(format!("expected 3 entries, got {}", entries.len()).into());
        };

        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2026-03-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.priority, Some(0.5));
        assert!(second.changefreq.is_none());

        assert_eq!(third.loc, "https://example.com/page3");
        assert!(third.lastmod.is_none());
        assert!(third.priority.is_none());

        Ok(())
    }

    /// Both nested sitemap locations are extracted in document order.
    #[test]
    fn parse_sitemapindex_extracts_nested_urls() -> TestResult {
        let urls = parse_sitemapindex(SITEMAPINDEX_XML)?;
        let found: Vec<&str> = urls.iter().map(String::as_str).collect();
        assert_eq!(
            found,
            [
                "https://example.com/sitemap1.xml",
                "https://example.com/sitemap2.xml.gz",
            ]
        );
        Ok(())
    }

    /// A `<urlset>` root is recognised.
    #[test]
    fn detect_root_urlset() -> TestResult {
        assert_eq!(detect_root_element(URLSET_XML)?, RootElement::UrlSet);
        Ok(())
    }

    /// A `<sitemapindex>` root is recognised.
    #[test]
    fn detect_root_sitemapindex() -> TestResult {
        assert_eq!(
            detect_root_element(SITEMAPINDEX_XML)?,
            RootElement::SitemapIndex
        );
        Ok(())
    }

    /// The lower-bound `lastmod` predicate keeps only the newest page.
    #[test]
    fn filter_by_lastmod_range() -> TestResult {
        let kept: Vec<SitemapEntry> = parse_urlset(URLSET_XML)?
            .into_iter()
            .filter(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= "2026-03-01"))
            .collect();
        let locs: Vec<&str> = kept.iter().map(|entry| entry.loc.as_str()).collect();
        assert_eq!(locs, ["https://example.com/page1"]);
        Ok(())
    }

    /// The priority predicate (missing priority counts as 0.0) keeps only the
    /// highest-priority page.
    #[test]
    fn filter_by_priority_threshold() -> TestResult {
        let kept: Vec<SitemapEntry> = parse_urlset(URLSET_XML)?
            .into_iter()
            .filter(|e| e.priority.unwrap_or(0.0) >= 0.6)
            .collect();
        let locs: Vec<&str> = kept.iter().map(|entry| entry.loc.as_str()).collect();
        assert_eq!(locs, ["https://example.com/page1"]);
        Ok(())
    }

    /// A gzip round trip of the fixture still parses to three entries.
    #[test]
    fn gzip_decompression() -> TestResult {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(URLSET_XML.as_bytes())?;
        let compressed = encoder.finish()?;

        let mut decompressed = String::new();
        GzDecoder::new(compressed.as_slice()).read_to_string(&mut decompressed)?;

        assert_eq!(parse_urlset(&decompressed)?.len(), 3);
        Ok(())
    }

    /// A truncated document with an unknown root is rejected.
    #[test]
    fn malformed_xml_returns_error() {
        assert!(detect_root_element("<not-a-sitemap><broken").is_err());
    }

    /// Empty input is rejected.
    #[test]
    fn empty_xml_returns_error() {
        assert!(detect_root_element("").is_err());
    }

    /// A well-formed document with a foreign root element is rejected.
    #[test]
    fn unexpected_root_element_returns_error() {
        let html = r#"<?xml version="1.0"?><html><body>nope</body></html>"#;
        assert!(detect_root_element(html).is_err());
    }

    /// An empty `<urlset>` parses to an empty list rather than an error.
    #[test]
    fn urlset_with_no_urls_returns_empty() -> TestResult {
        let empty = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        assert!(parse_urlset(empty)?.is_empty());
        Ok(())
    }

    /// A `<url>` lacking `<loc>` is dropped while its siblings are kept.
    #[test]
    fn url_without_loc_is_skipped() -> TestResult {
        let doc = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <url>
 <lastmod>2026-01-01</lastmod>
 </url>
 <url>
 <loc>https://example.com/valid</loc>
 </url>
</urlset>"#;
        let entries = parse_urlset(doc)?;
        let locs: Vec<&str> = entries.iter().map(|entry| entry.loc.as_str()).collect();
        assert_eq!(locs, ["https://example.com/valid"]);
        Ok(())
    }
}