1use scraper::{Html, Selector};
11use serde::{Deserialize, Serialize};
12use url::Url;
13
14use crate::types::ParserResult;
15
/// Aggregated feed and sitemap discovery results for a single HTML document.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct FeedInfo {
    /// RSS feeds found via `<link rel="alternate">` tags or heuristic anchor scanning.
    pub rss_feeds: Vec<Feed>,
    /// Atom feeds.
    pub atom_feeds: Vec<Feed>,
    /// JSON feeds (`application/feed+json`, or `application/json` with "feed" in the URL).
    pub json_feeds: Vec<Feed>,
    /// Sitemaps found via `<link rel="sitemap">` tags or anchors mentioning "sitemap".
    pub sitemaps: Vec<Sitemap>,
    /// True when at least one feed of any kind was discovered.
    pub has_feeds: bool,
    /// True when at least one sitemap was discovered.
    pub has_sitemaps: bool,
}
36
37impl FeedInfo {
38 pub fn new() -> Self {
39 Self::default()
40 }
41
42 pub fn all_feeds(&self) -> Vec<&Feed> {
44 self.rss_feeds.iter()
45 .chain(self.atom_feeds.iter())
46 .chain(self.json_feeds.iter())
47 .collect()
48 }
49
50 pub fn primary_feed(&self) -> Option<&Feed> {
52 self.atom_feeds.first()
53 .or_else(|| self.rss_feeds.first())
54 .or_else(|| self.json_feeds.first())
55 }
56
57 pub fn feed_urls(&self) -> Vec<&str> {
59 self.all_feeds().iter().map(|f| f.url.as_str()).collect()
60 }
61
62 pub fn sitemap_urls(&self) -> Vec<&str> {
64 self.sitemaps.iter().map(|s| s.url.as_str()).collect()
65 }
66}
67
/// A single discovered syndication feed.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Feed {
    /// Feed URL, resolved to absolute form when a base URL was available.
    pub url: String,
    /// Human-readable title from the `title` attribute or anchor text, if any.
    pub title: Option<String>,
    /// Detected syndication format.
    pub feed_type: FeedType,
    /// MIME type from the `<link>` tag's `type` attribute, if present.
    pub mime_type: Option<String>,
    /// Language from the `hreflang` attribute, if present.
    pub language: Option<String>,
}
82
83impl Feed {
84 pub fn new(url: String, feed_type: FeedType) -> Self {
85 Self {
86 url,
87 title: None,
88 feed_type,
89 mime_type: None,
90 language: None,
91 }
92 }
93
94 pub fn is_rss(&self) -> bool {
96 matches!(self.feed_type, FeedType::Rss | FeedType::Rss2)
97 }
98
99 pub fn is_atom(&self) -> bool {
101 matches!(self.feed_type, FeedType::Atom)
102 }
103}
104
/// Syndication-format classification for a discovered feed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum FeedType {
    /// Generic/legacy RSS.
    Rss,
    /// RSS 2.0 — used as the default guess by the URL heuristics.
    #[default]
    Rss2,
    /// Atom.
    Atom,
    /// JSON Feed.
    Json,
    /// Could not be classified from MIME type or URL.
    Unknown,
}
120
/// A single discovered sitemap reference.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Sitemap {
    /// Sitemap URL, resolved to absolute form when a base URL was available.
    pub url: String,
    /// Kind of sitemap, guessed from the URL.
    pub sitemap_type: SitemapType,
    /// How this sitemap was discovered.
    pub source: SitemapSource,
}
131
132impl Sitemap {
133 pub fn new(url: String, sitemap_type: SitemapType) -> Self {
134 Self {
135 url,
136 sitemap_type,
137 source: SitemapSource::LinkTag,
138 }
139 }
140}
141
/// Kind of sitemap, inferred from URL substrings by `detect_sitemap_type`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum SitemapType {
    /// Plain XML sitemap — the fallback classification.
    #[default]
    Xml,
    /// Sitemap index (URL contains "sitemap_index" or "sitemap-index").
    Index,
    /// News sitemap.
    News,
    /// Image sitemap.
    Image,
    /// Video sitemap.
    Video,
    /// Plain-text URL list (`.txt`).
    Text,
    /// Gzip-compressed sitemap (`.gz`).
    Gzip,
}
161
/// Where a sitemap reference was discovered.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum SitemapSource {
    /// Found in the HTML document (link tag or anchor).
    #[default]
    LinkTag,
    /// Listed in robots.txt.
    RobotsTxt,
    /// Probed at a well-known path.
    WellKnown,
    /// Listed inside another sitemap index.
    SitemapIndex,
}
175
176pub fn extract_feed_info(document: &Html, base_url: Option<&Url>) -> ParserResult<FeedInfo> {
182 let mut info = FeedInfo::new();
183
184 extract_link_feeds(document, &mut info, base_url);
186
187 extract_sitemaps(document, &mut info, base_url);
189
190 info.has_feeds = !info.rss_feeds.is_empty() ||
192 !info.atom_feeds.is_empty() ||
193 !info.json_feeds.is_empty();
194 info.has_sitemaps = !info.sitemaps.is_empty();
195
196 Ok(info)
197}
198
/// Discovers feeds in `document` and appends them to `info`, bucketed by type.
///
/// Two passes: explicit `<link rel="alternate">` tags with feed MIME types,
/// then a heuristic scan of `<a>` elements whose href mentions feed/rss/atom.
/// Relative hrefs are resolved against `base_url` when one is provided.
fn extract_link_feeds(document: &Html, info: &mut FeedInfo, base_url: Option<&Url>) {
    // Static, known-valid selector string: unwrap cannot panic.
    let feed_selector = Selector::parse(
        r#"link[rel="alternate"][type="application/rss+xml"],
        link[rel="alternate"][type="application/atom+xml"],
        link[rel="alternate"][type="application/feed+json"],
        link[rel="alternate"][type="application/json"]"#
    ).unwrap();

    for el in document.select(&feed_selector) {
        // A <link> without href carries no feed location; skip it.
        let href = match el.value().attr("href") {
            Some(h) => h,
            None => continue,
        };

        // Fall back to the raw href when resolution fails (e.g. no base URL).
        let url = resolve_url(href, base_url).unwrap_or_else(|| href.to_string());
        let mime_type = el.value().attr("type").map(|s| s.to_string());
        let title = el.value().attr("title").map(|s| s.to_string());
        let hreflang = el.value().attr("hreflang").map(|s| s.to_string());

        let feed_type = detect_feed_type(&mime_type, &url);

        let mut feed = Feed::new(url, feed_type);
        feed.title = title;
        feed.mime_type = mime_type;
        feed.language = hreflang;

        // Unknown types land in the RSS bucket via the catch-all arm.
        match feed_type {
            FeedType::Atom => info.atom_feeds.push(feed),
            FeedType::Json => info.json_feeds.push(feed),
            _ => info.rss_feeds.push(feed),
        }
    }

    // Heuristic pass: anchors whose href merely looks feed-like.
    if let Ok(sel) = Selector::parse("a[href*='feed'], a[href*='rss'], a[href*='atom']") {
        for el in document.select(&sel) {
            if let Some(href) = el.value().attr("href") {
                let url = resolve_url(href, base_url).unwrap_or_else(|| href.to_string());

                // Skip URLs already recorded by the <link> pass above.
                if info.all_feeds().iter().any(|f| f.url == url) {
                    continue;
                }

                // Only URL-based detection is available here; drop ambiguous links.
                let feed_type = detect_feed_type_from_url(&url);
                if feed_type == FeedType::Unknown {
                    continue;
                }

                let mut feed = Feed::new(url, feed_type);
                // NOTE(review): an anchor with no text yields Some("") rather than
                // None here — confirm whether empty titles are intended downstream.
                feed.title = Some(el.text().collect::<String>().trim().to_string());

                match feed_type {
                    FeedType::Atom => info.atom_feeds.push(feed),
                    FeedType::Json => info.json_feeds.push(feed),
                    _ => info.rss_feeds.push(feed),
                }
            }
        }
    }
}
263
264fn detect_feed_type(mime_type: &Option<String>, url: &str) -> FeedType {
266 if let Some(ref mime) = mime_type {
267 match mime.as_str() {
268 "application/atom+xml" => return FeedType::Atom,
269 "application/rss+xml" => return FeedType::Rss2,
270 "application/feed+json" | "application/json" => {
271 if url.contains("feed") {
272 return FeedType::Json;
273 }
274 }
275 _ => {}
276 }
277 }
278
279 detect_feed_type_from_url(url)
280}
281
282fn detect_feed_type_from_url(url: &str) -> FeedType {
284 let url_lower = url.to_lowercase();
285
286 if url_lower.contains("atom") {
287 FeedType::Atom
288 } else if url_lower.contains("rss") || url_lower.contains("feed.xml") {
289 FeedType::Rss2
290 } else if url_lower.ends_with("feed.json") || url_lower.contains("feed/json") {
291 FeedType::Json
292 } else if url_lower.contains("feed") || url_lower.ends_with(".xml") {
293 FeedType::Rss2
294 } else {
295 FeedType::Unknown
296 }
297}
298
299fn extract_sitemaps(document: &Html, info: &mut FeedInfo, base_url: Option<&Url>) {
301 if let Ok(sel) = Selector::parse("link[rel='sitemap']") {
303 for el in document.select(&sel) {
304 if let Some(href) = el.value().attr("href") {
305 let url = resolve_url(href, base_url).unwrap_or_else(|| href.to_string());
306 let sitemap_type = detect_sitemap_type(&url);
307
308 let mut sitemap = Sitemap::new(url, sitemap_type);
309 sitemap.source = SitemapSource::LinkTag;
310 info.sitemaps.push(sitemap);
311 }
312 }
313 }
314
315 if let Ok(sel) = Selector::parse("a[href*='sitemap']") {
317 for el in document.select(&sel) {
318 if let Some(href) = el.value().attr("href") {
319 let url = resolve_url(href, base_url).unwrap_or_else(|| href.to_string());
320
321 if info.sitemaps.iter().any(|s| s.url == url) {
323 continue;
324 }
325
326 let sitemap_type = detect_sitemap_type(&url);
327 let mut sitemap = Sitemap::new(url, sitemap_type);
328 sitemap.source = SitemapSource::LinkTag;
329 info.sitemaps.push(sitemap);
330 }
331 }
332 }
333}
334
335fn detect_sitemap_type(url: &str) -> SitemapType {
337 let url_lower = url.to_lowercase();
338
339 if url_lower.ends_with(".gz") {
340 SitemapType::Gzip
341 } else if url_lower.contains("sitemap_index") || url_lower.contains("sitemap-index") {
342 SitemapType::Index
343 } else if url_lower.contains("news") {
344 SitemapType::News
345 } else if url_lower.contains("image") {
346 SitemapType::Image
347 } else if url_lower.contains("video") {
348 SitemapType::Video
349 } else if url_lower.ends_with(".txt") {
350 SitemapType::Text
351 } else {
352 SitemapType::Xml
353 }
354}
355
356fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
358 if href.starts_with("http://") || href.starts_with("https://") {
359 return Some(href.to_string());
360 }
361
362 if href.starts_with("//") {
363 return Some(format!("https:{}", href));
364 }
365
366 if let Some(base) = base_url {
367 return base.join(href).ok().map(|u| u.to_string());
368 }
369
370 None
371}
372
/// Well-known paths where sites commonly expose feeds (WordPress, Blogger,
/// Hugo-style `/index.xml`, etc.); used to probe for feeds that are not
/// advertised in the HTML itself.
pub const COMMON_FEED_PATHS: &[&str] = &[
    "/feed",
    "/feed/",
    "/feed.xml",
    "/feed.rss",
    "/rss",
    "/rss/",
    "/rss.xml",
    "/atom.xml",
    "/atom",
    "/feed.atom",
    "/feeds/posts/default",
    "/blog/feed",
    "/blog/rss",
    "/index.xml",
    "/.rss",
    "/feed.json",
];
396
/// Well-known paths where sites commonly expose sitemaps; used to probe for
/// sitemaps that are not referenced from the HTML or robots.txt.
pub const COMMON_SITEMAP_PATHS: &[&str] = &[
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap",
    "/sitemaps.xml",
    "/sitemap1.xml",
    "/sitemap-index.xml",
    "/post-sitemap.xml",
    "/page-sitemap.xml",
    "/news-sitemap.xml",
    "/sitemap.xml.gz",
];
410
411pub fn generate_feed_urls(base_url: &Url) -> Vec<String> {
413 COMMON_FEED_PATHS.iter()
414 .filter_map(|path| base_url.join(path).ok())
415 .map(|u| u.to_string())
416 .collect()
417}
418
419pub fn generate_sitemap_urls(base_url: &Url) -> Vec<String> {
421 COMMON_SITEMAP_PATHS.iter()
422 .filter_map(|path| base_url.join(path).ok())
423 .map(|u| u.to_string())
424 .collect()
425}
426
427pub fn has_feeds(document: &Html) -> bool {
433 extract_feed_info(document, None)
434 .map(|i| i.has_feeds)
435 .unwrap_or(false)
436}
437
438pub fn get_rss_feed(document: &Html, base_url: Option<&Url>) -> Option<String> {
440 extract_feed_info(document, base_url)
441 .ok()
442 .and_then(|i| i.rss_feeds.first().map(|f| f.url.clone()))
443}
444
445pub fn get_atom_feed(document: &Html, base_url: Option<&Url>) -> Option<String> {
447 extract_feed_info(document, base_url)
448 .ok()
449 .and_then(|i| i.atom_feeds.first().map(|f| f.url.clone()))
450}
451
452pub fn get_feed(document: &Html, base_url: Option<&Url>) -> Option<String> {
454 extract_feed_info(document, base_url)
455 .ok()
456 .and_then(|i| i.primary_feed().map(|f| f.url.clone()))
457}
458
459pub fn get_sitemap(document: &Html, base_url: Option<&Url>) -> Option<String> {
461 extract_feed_info(document, base_url)
462 .ok()
463 .and_then(|i| i.sitemaps.first().map(|s| s.url.clone()))
464}
465
#[cfg(test)]
mod tests {
    use super::*;

    /// Convenience wrapper around `Html::parse_document`.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // RSS <link> tag is extracted with resolved URL and title.
    #[test]
    fn test_extract_rss_feed() {
        let html = r#"
            <html>
            <head>
                <link rel="alternate" type="application/rss+xml"
                      title="RSS Feed" href="/feed.xml">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let base = Url::parse("https://example.com/").unwrap();
        let info = extract_feed_info(&doc, Some(&base)).unwrap();

        assert!(info.has_feeds);
        assert_eq!(info.rss_feeds.len(), 1);
        assert_eq!(info.rss_feeds[0].url, "https://example.com/feed.xml");
        assert_eq!(info.rss_feeds[0].title, Some("RSS Feed".to_string()));
    }

    // Atom <link> tag lands in the atom bucket with the Atom type.
    #[test]
    fn test_extract_atom_feed() {
        let html = r#"
            <html>
            <head>
                <link rel="alternate" type="application/atom+xml"
                      title="Atom Feed" href="/atom.xml">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let base = Url::parse("https://example.com/").unwrap();
        let info = extract_feed_info(&doc, Some(&base)).unwrap();

        assert!(info.has_feeds);
        assert_eq!(info.atom_feeds.len(), 1);
        assert_eq!(info.atom_feeds[0].feed_type, FeedType::Atom);
    }

    // JSON feed link is recognized even without a base URL.
    #[test]
    fn test_extract_json_feed() {
        let html = r#"
            <html>
            <head>
                <link rel="alternate" type="application/feed+json"
                      title="JSON Feed" href="/feed.json">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let info = extract_feed_info(&doc, None).unwrap();

        assert!(info.has_feeds);
        assert_eq!(info.json_feeds.len(), 1);
        assert_eq!(info.json_feeds[0].feed_type, FeedType::Json);
    }

    // Multiple feed types coexist; Atom wins as the primary feed.
    #[test]
    fn test_extract_multiple_feeds() {
        let html = r#"
            <html>
            <head>
                <link rel="alternate" type="application/rss+xml" href="/rss.xml">
                <link rel="alternate" type="application/atom+xml" href="/atom.xml">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let info = extract_feed_info(&doc, None).unwrap();

        assert_eq!(info.all_feeds().len(), 2);
        assert_eq!(info.primary_feed().unwrap().feed_type, FeedType::Atom);
    }

    // Sitemap anchors in page content are discovered and resolved.
    #[test]
    fn test_extract_sitemap_link() {
        let html = r#"
            <html>
            <body>
                <footer>
                    <a href="/sitemap.xml">Sitemap</a>
                </footer>
            </body>
            </html>
        "#;

        let doc = parse_html(html);
        let base = Url::parse("https://example.com/").unwrap();
        let info = extract_feed_info(&doc, Some(&base)).unwrap();

        assert!(info.has_sitemaps);
        assert_eq!(info.sitemaps[0].url, "https://example.com/sitemap.xml");
    }

    // URL-substring classification for each sitemap variant.
    #[test]
    fn test_detect_sitemap_types() {
        assert_eq!(detect_sitemap_type("/sitemap.xml"), SitemapType::Xml);
        assert_eq!(detect_sitemap_type("/sitemap.xml.gz"), SitemapType::Gzip);
        assert_eq!(detect_sitemap_type("/sitemap_index.xml"), SitemapType::Index);
        assert_eq!(detect_sitemap_type("/news-sitemap.xml"), SitemapType::News);
        assert_eq!(detect_sitemap_type("/image-sitemap.xml"), SitemapType::Image);
        assert_eq!(detect_sitemap_type("/sitemap.txt"), SitemapType::Text);
    }

    // URL-substring classification for feed types.
    #[test]
    fn test_detect_feed_type_from_url() {
        assert_eq!(detect_feed_type_from_url("/atom.xml"), FeedType::Atom);
        assert_eq!(detect_feed_type_from_url("/rss.xml"), FeedType::Rss2);
        assert_eq!(detect_feed_type_from_url("/feed.json"), FeedType::Json);
        assert_eq!(detect_feed_type_from_url("/feed"), FeedType::Rss2);
    }

    // Probe-URL generation covers common feed paths.
    #[test]
    fn test_generate_feed_urls() {
        let base = Url::parse("https://example.com/").unwrap();
        let urls = generate_feed_urls(&base);

        assert!(urls.contains(&"https://example.com/feed".to_string()));
        assert!(urls.contains(&"https://example.com/rss.xml".to_string()));
        assert!(urls.contains(&"https://example.com/atom.xml".to_string()));
    }

    // Probe-URL generation covers common sitemap paths.
    #[test]
    fn test_generate_sitemap_urls() {
        let base = Url::parse("https://example.com/").unwrap();
        let urls = generate_sitemap_urls(&base);

        assert!(urls.contains(&"https://example.com/sitemap.xml".to_string()));
        assert!(urls.contains(&"https://example.com/sitemap_index.xml".to_string()));
    }

    // Accessors: all_feeds preserves RSS→Atom order; primary prefers Atom.
    #[test]
    fn test_feed_info_methods() {
        let mut info = FeedInfo::new();
        info.rss_feeds.push(Feed::new("/rss".to_string(), FeedType::Rss2));
        info.atom_feeds.push(Feed::new("/atom".to_string(), FeedType::Atom));

        assert_eq!(info.all_feeds().len(), 2);
        assert_eq!(info.feed_urls(), vec!["/rss", "/atom"]);
        assert_eq!(info.primary_feed().unwrap().feed_type, FeedType::Atom);
    }

    // Type predicates are mutually exclusive for RSS vs Atom.
    #[test]
    fn test_feed_is_rss_atom() {
        let rss = Feed::new("/feed".to_string(), FeedType::Rss2);
        let atom = Feed::new("/atom".to_string(), FeedType::Atom);

        assert!(rss.is_rss());
        assert!(!rss.is_atom());
        assert!(atom.is_atom());
        assert!(!atom.is_rss());
    }

    // A document with no feed markers sets neither summary flag.
    #[test]
    fn test_no_feeds() {
        let html = "<html><body><p>No feeds here</p></body></html>";
        let doc = parse_html(html);
        let info = extract_feed_info(&doc, None).unwrap();

        assert!(!info.has_feeds);
        assert!(!info.has_sitemaps);
    }

    // hreflang attributes are captured as the feed language.
    #[test]
    fn test_feed_with_hreflang() {
        let html = r#"
            <html>
            <head>
                <link rel="alternate" type="application/rss+xml"
                      hreflang="en" href="/feed-en.xml">
                <link rel="alternate" type="application/rss+xml"
                      hreflang="fr" href="/feed-fr.xml">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let info = extract_feed_info(&doc, None).unwrap();

        assert_eq!(info.rss_feeds.len(), 2);
        assert_eq!(info.rss_feeds[0].language, Some("en".to_string()));
        assert_eq!(info.rss_feeds[1].language, Some("fr".to_string()));
    }
}