stygian_graph/adapters/
sitemap.rs1use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
32use async_trait::async_trait;
33use flate2::read::GzDecoder;
34use quick_xml::Reader;
35use quick_xml::events::Event;
36use serde::{Deserialize, Serialize};
37use serde_json::json;
38use std::io::Read;
39
40#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
57pub struct SitemapEntry {
58 pub loc: String,
60 pub lastmod: Option<String>,
62 pub changefreq: Option<String>,
64 pub priority: Option<f64>,
66}
67
68pub struct SitemapAdapter {
83 client: reqwest::Client,
84 max_depth: usize,
85}
86
87impl SitemapAdapter {
88 pub fn new(client: reqwest::Client, max_depth: usize) -> Self {
100 Self { client, max_depth }
101 }
102
103 async fn fetch_bytes(&self, url: &str) -> Result<String> {
109 let resp = self.client.get(url).send().await.map_err(|e| {
110 StygianError::Service(ServiceError::Unavailable(format!(
111 "sitemap fetch failed: {e}"
112 )))
113 })?;
114
115 if !resp.status().is_success() {
116 return Err(StygianError::Service(ServiceError::InvalidResponse(
117 format!("sitemap returned HTTP {}", resp.status()),
118 )));
119 }
120
121 let bytes = resp.bytes().await.map_err(|e| {
122 StygianError::Service(ServiceError::Unavailable(format!(
123 "sitemap body read failed: {e}"
124 )))
125 })?;
126
127 if url.ends_with(".gz") || (bytes.len() >= 2 && bytes[0] == 0x1f && bytes[1] == 0x8b) {
129 let mut decoder = GzDecoder::new(&bytes[..]);
130 let mut xml = String::new();
131 decoder.read_to_string(&mut xml).map_err(|e| {
132 StygianError::Service(ServiceError::InvalidResponse(format!(
133 "gzip decompression failed: {e}"
134 )))
135 })?;
136 Ok(xml)
137 } else {
138 String::from_utf8(bytes.to_vec()).map_err(|e| {
139 StygianError::Service(ServiceError::InvalidResponse(format!(
140 "sitemap not valid UTF-8: {e}"
141 )))
142 })
143 }
144 }
145
146 async fn resolve(&self, url: &str, depth: usize) -> Result<Vec<SitemapEntry>> {
152 if depth > self.max_depth {
153 return Err(StygianError::Service(ServiceError::InvalidResponse(
154 format!(
155 "sitemap index nesting exceeded max depth ({depth} > {})",
156 self.max_depth
157 ),
158 )));
159 }
160
161 let xml = self.fetch_bytes(url).await?;
162 let root_kind = detect_root_element(&xml)?;
163
164 match root_kind {
165 RootElement::UrlSet => parse_urlset(&xml),
166 RootElement::SitemapIndex => {
167 let nested_urls = parse_sitemapindex(&xml)?;
168 let mut all = Vec::new();
169 for nested_url in &nested_urls {
170 let entries = Box::pin(self.resolve(nested_url, depth + 1)).await?;
171 all.extend(entries);
172 }
173 Ok(all)
174 }
175 }
176 }
177}
178
179#[async_trait]
180impl ScrapingService for SitemapAdapter {
181 async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
205 let mut entries = self.resolve(&input.url, 0).await?;
206
207 if let Some(min_pri) = input.params.get("min_priority").and_then(|v| v.as_f64()) {
209 entries.retain(|e| e.priority.unwrap_or(0.0) >= min_pri);
210 }
211 if let Some(after) = input.params.get("lastmod_after").and_then(|v| v.as_str()) {
212 entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm >= after));
213 }
214 if let Some(before) = input.params.get("lastmod_before").and_then(|v| v.as_str()) {
215 entries.retain(|e| e.lastmod.as_deref().is_some_and(|lm| lm <= before));
216 }
217
218 let count = entries.len();
219 let data = serde_json::to_string(&entries).map_err(|e| {
220 StygianError::Service(ServiceError::InvalidResponse(format!(
221 "sitemap serialization failed: {e}"
222 )))
223 })?;
224
225 Ok(ServiceOutput {
226 data,
227 metadata: json!({
228 "source": "sitemap",
229 "url_count": count,
230 "source_url": input.url,
231 }),
232 })
233 }
234
235 fn name(&self) -> &'static str {
236 "sitemap"
237 }
238}
239
/// The two document kinds a sitemap URL can resolve to, identified by the
/// local name of the XML root element.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RootElement {
    /// `<urlset>`: a leaf sitemap listing page URLs.
    UrlSet,
    /// `<sitemapindex>`: an index listing further sitemaps.
    SitemapIndex,
}
247
248fn detect_root_element(xml: &str) -> Result<RootElement> {
250 let mut reader = Reader::from_str(xml);
251 let mut buf = Vec::new();
252
253 loop {
254 match reader.read_event_into(&mut buf) {
255 Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
256 let local = e.local_name();
257 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
258 return match name {
259 "urlset" => Ok(RootElement::UrlSet),
260 "sitemapindex" => Ok(RootElement::SitemapIndex),
261 _ => Err(StygianError::Service(ServiceError::InvalidResponse(
262 format!("unexpected XML root element: <{name}>"),
263 ))),
264 };
265 }
266 Ok(Event::Eof) => {
267 return Err(StygianError::Service(ServiceError::InvalidResponse(
268 "empty or invalid XML document".into(),
269 )));
270 }
271 Err(e) => {
272 return Err(StygianError::Service(ServiceError::InvalidResponse(
273 format!("XML parse error: {e}"),
274 )));
275 }
276 _ => {} }
278 buf.clear();
279 }
280}
281
282fn parse_urlset(xml: &str) -> Result<Vec<SitemapEntry>> {
284 let mut reader = Reader::from_str(xml);
285 let mut buf = Vec::new();
286 let mut entries = Vec::new();
287
288 let mut current: Option<SitemapEntryBuilder> = None;
290 let mut current_tag: Option<String> = None;
291
292 loop {
293 match reader.read_event_into(&mut buf) {
294 Ok(Event::Start(ref e)) => {
295 let name = local_name(e);
296 match name.as_str() {
297 "url" => {
298 current = Some(SitemapEntryBuilder::default());
299 }
300 "loc" | "lastmod" | "changefreq" | "priority" => {
301 current_tag = Some(name);
302 }
303 _ => {}
304 }
305 }
306 Ok(Event::Text(ref t)) => {
307 if let (Some(builder), Some(tag)) = (&mut current, ¤t_tag) {
308 let text = t.unescape().unwrap_or_default().trim().to_string();
309 if !text.is_empty() {
310 match tag.as_str() {
311 "loc" => builder.loc = Some(text),
312 "lastmod" => builder.lastmod = Some(text),
313 "changefreq" => builder.changefreq = Some(text),
314 "priority" => builder.priority = text.parse().ok(),
315 _ => {}
316 }
317 }
318 }
319 }
320 Ok(Event::End(ref e)) => {
321 let name = local_name_end(e);
322 if name == "url"
323 && let Some(builder) = current.take()
324 && let Some(entry) = builder.build()
325 {
326 entries.push(entry);
327 }
328 if current_tag.as_deref() == Some(&name) {
329 current_tag = None;
330 }
331 }
332 Ok(Event::Eof) => break,
333 Err(e) => {
334 return Err(StygianError::Service(ServiceError::InvalidResponse(
335 format!("sitemap XML parse error: {e}"),
336 )));
337 }
338 _ => {}
339 }
340 buf.clear();
341 }
342
343 Ok(entries)
344}
345
346fn parse_sitemapindex(xml: &str) -> Result<Vec<String>> {
348 let mut reader = Reader::from_str(xml);
349 let mut buf = Vec::new();
350 let mut urls = Vec::new();
351 let mut in_sitemap = false;
352 let mut in_loc = false;
353
354 loop {
355 match reader.read_event_into(&mut buf) {
356 Ok(Event::Start(ref e)) => {
357 let name = local_name(e);
358 match name.as_str() {
359 "sitemap" => in_sitemap = true,
360 "loc" if in_sitemap => in_loc = true,
361 _ => {}
362 }
363 }
364 Ok(Event::Text(ref t)) => {
365 if in_loc {
366 let text = t.unescape().unwrap_or_default().trim().to_string();
367 if !text.is_empty() {
368 urls.push(text);
369 }
370 }
371 }
372 Ok(Event::End(ref e)) => {
373 let name = local_name_end(e);
374 match name.as_str() {
375 "sitemap" => {
376 in_sitemap = false;
377 in_loc = false;
378 }
379 "loc" => in_loc = false,
380 _ => {}
381 }
382 }
383 Ok(Event::Eof) => break,
384 Err(e) => {
385 return Err(StygianError::Service(ServiceError::InvalidResponse(
386 format!("sitemapindex XML parse error: {e}"),
387 )));
388 }
389 _ => {}
390 }
391 buf.clear();
392 }
393
394 Ok(urls)
395}
396
397fn local_name(e: &quick_xml::events::BytesStart<'_>) -> String {
399 std::str::from_utf8(e.local_name().as_ref())
400 .unwrap_or("")
401 .to_string()
402}
403
404fn local_name_end(e: &quick_xml::events::BytesEnd<'_>) -> String {
406 std::str::from_utf8(e.local_name().as_ref())
407 .unwrap_or("")
408 .to_string()
409}
410
/// Accumulates the fields of one `<url>` element while it is being parsed.
///
/// Every field starts as `None` (via `Default`). `Debug` is derived so a
/// partially filled builder can be inspected in logs and test failures.
#[derive(Debug, Default)]
struct SitemapEntryBuilder {
    /// Page URL; an entry cannot be built without it.
    loc: Option<String>,
    /// Last-modification value, as written in the document.
    lastmod: Option<String>,
    /// Change-frequency value, as written in the document.
    changefreq: Option<String>,
    /// Parsed priority value.
    priority: Option<f64>,
}
420
421impl SitemapEntryBuilder {
422 fn build(self) -> Option<SitemapEntry> {
423 Some(SitemapEntry {
424 loc: self.loc?,
425 lastmod: self.lastmod,
426 changefreq: self.changefreq,
427 priority: self.priority,
428 })
429 }
430}
431
#[cfg(test)]
mod tests {
    use super::*;

    /// Leaf sitemap with three entries: one fully populated, one without
    /// `changefreq`, and one carrying only `loc`.
    const URLSET_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>https://example.com/page1</loc>
        <lastmod>2026-03-01</lastmod>
        <changefreq>daily</changefreq>
        <priority>0.8</priority>
    </url>
    <url>
        <loc>https://example.com/page2</loc>
        <lastmod>2026-02-15</lastmod>
        <priority>0.5</priority>
    </url>
    <url>
        <loc>https://example.com/page3</loc>
    </url>
</urlset>"#;

    /// Sitemap index pointing at one plain and one gzip child sitemap.
    const SITEMAPINDEX_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <sitemap>
        <loc>https://example.com/sitemap1.xml</loc>
        <lastmod>2026-03-01</lastmod>
    </sitemap>
    <sitemap>
        <loc>https://example.com/sitemap2.xml.gz</loc>
    </sitemap>
</sitemapindex>"#;

    #[test]
    fn parse_urlset_with_3_urls() {
        let parsed = parse_urlset(URLSET_XML).expect("parse");
        assert_eq!(parsed.len(), 3);

        let first = &parsed[0];
        assert_eq!(first.loc, "https://example.com/page1");
        assert_eq!(first.lastmod.as_deref(), Some("2026-03-01"));
        assert_eq!(first.changefreq.as_deref(), Some("daily"));
        assert_eq!(first.priority, Some(0.8));

        let second = &parsed[1];
        assert_eq!(second.loc, "https://example.com/page2");
        assert_eq!(second.priority, Some(0.5));
        assert!(second.changefreq.is_none());

        let third = &parsed[2];
        assert_eq!(third.loc, "https://example.com/page3");
        assert!(third.lastmod.is_none());
        assert!(third.priority.is_none());
    }

    #[test]
    fn parse_sitemapindex_extracts_nested_urls() {
        let nested = parse_sitemapindex(SITEMAPINDEX_XML).expect("parse");
        assert_eq!(nested.len(), 2);
        assert_eq!(nested[0], "https://example.com/sitemap1.xml");
        assert_eq!(nested[1], "https://example.com/sitemap2.xml.gz");
    }

    #[test]
    fn detect_root_urlset() {
        let kind = detect_root_element(URLSET_XML).expect("detect");
        assert_eq!(kind, RootElement::UrlSet);
    }

    #[test]
    fn detect_root_sitemapindex() {
        let kind = detect_root_element(SITEMAPINDEX_XML).expect("detect");
        assert_eq!(kind, RootElement::SitemapIndex);
    }

    #[test]
    fn filter_by_lastmod_range() {
        // Same predicate the adapter applies for `lastmod_after`.
        let lower_bound = "2026-03-01";
        let mut kept = parse_urlset(URLSET_XML).expect("parse");
        kept.retain(|entry| {
            entry
                .lastmod
                .as_deref()
                .is_some_and(|lm| lm >= lower_bound)
        });
        assert_eq!(kept.len(), 1);
        assert_eq!(kept[0].loc, "https://example.com/page1");
    }

    #[test]
    fn filter_by_priority_threshold() {
        // Same predicate the adapter applies for `min_priority`.
        let threshold = 0.6;
        let mut kept = parse_urlset(URLSET_XML).expect("parse");
        kept.retain(|entry| entry.priority.unwrap_or(0.0) >= threshold);
        assert_eq!(kept.len(), 1);
        assert_eq!(kept[0].loc, "https://example.com/page1");
    }

    #[test]
    fn gzip_decompression() {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        // Compress the fixture, then make sure it round-trips and parses.
        let mut gz = GzEncoder::new(Vec::new(), Compression::default());
        gz.write_all(URLSET_XML.as_bytes()).expect("gz write");
        let packed = gz.finish().expect("gz finish");

        let mut unpacked = String::new();
        GzDecoder::new(&packed[..])
            .read_to_string(&mut unpacked)
            .expect("gz read");

        let parsed = parse_urlset(&unpacked).expect("parse");
        assert_eq!(parsed.len(), 3);
    }

    #[test]
    fn malformed_xml_returns_error() {
        let outcome = detect_root_element("<not-a-sitemap><broken");
        assert!(outcome.is_err());
    }

    #[test]
    fn empty_xml_returns_error() {
        assert!(detect_root_element("").is_err());
    }

    #[test]
    fn unexpected_root_element_returns_error() {
        let html = r#"<?xml version="1.0"?><html><body>nope</body></html>"#;
        assert!(detect_root_element(html).is_err());
    }

    #[test]
    fn urlset_with_no_urls_returns_empty() {
        let xml = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        let parsed = parse_urlset(xml).expect("parse");
        assert!(parsed.is_empty());
    }

    #[test]
    fn url_without_loc_is_skipped() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <lastmod>2026-01-01</lastmod>
    </url>
    <url>
        <loc>https://example.com/valid</loc>
    </url>
</urlset>"#;
        let parsed = parse_urlset(xml).expect("parse");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].loc, "https://example.com/valid");
    }
}