halldyll_parser/
feeds.rs

1//! Feed and sitemap detection for halldyll-parser
2//!
3//! This module handles detection and extraction of:
4//! - RSS feeds
5//! - Atom feeds
6//! - Sitemap XML
7//! - Sitemap index
8//! - JSON Feed
9
10use scraper::{Html, Selector};
11use serde::{Deserialize, Serialize};
12use url::Url;
13
14use crate::types::ParserResult;
15
16// ============================================================================
17// TYPES
18// ============================================================================
19
20/// All feeds and sitemaps found on a page
21#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
22pub struct FeedInfo {
23    /// RSS feeds found
24    pub rss_feeds: Vec<Feed>,
25    /// Atom feeds found
26    pub atom_feeds: Vec<Feed>,
27    /// JSON feeds found
28    pub json_feeds: Vec<Feed>,
29    /// Sitemap URLs found
30    pub sitemaps: Vec<Sitemap>,
31    /// Whether page has any feeds
32    pub has_feeds: bool,
33    /// Whether page has sitemaps
34    pub has_sitemaps: bool,
35}
36
37impl FeedInfo {
38    pub fn new() -> Self {
39        Self::default()
40    }
41
42    /// Get all feeds (RSS + Atom + JSON)
43    pub fn all_feeds(&self) -> Vec<&Feed> {
44        self.rss_feeds.iter()
45            .chain(self.atom_feeds.iter())
46            .chain(self.json_feeds.iter())
47            .collect()
48    }
49
50    /// Get primary feed (prefer Atom, then RSS, then JSON)
51    pub fn primary_feed(&self) -> Option<&Feed> {
52        self.atom_feeds.first()
53            .or_else(|| self.rss_feeds.first())
54            .or_else(|| self.json_feeds.first())
55    }
56
57    /// Get all feed URLs
58    pub fn feed_urls(&self) -> Vec<&str> {
59        self.all_feeds().iter().map(|f| f.url.as_str()).collect()
60    }
61
62    /// Get all sitemap URLs
63    pub fn sitemap_urls(&self) -> Vec<&str> {
64        self.sitemaps.iter().map(|s| s.url.as_str()).collect()
65    }
66}
67
68/// A web feed (RSS, Atom, or JSON)
69#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
70pub struct Feed {
71    /// Feed URL
72    pub url: String,
73    /// Feed title (if available)
74    pub title: Option<String>,
75    /// Feed type
76    pub feed_type: FeedType,
77    /// MIME type from link element
78    pub mime_type: Option<String>,
79    /// Language hint
80    pub language: Option<String>,
81}
82
83impl Feed {
84    pub fn new(url: String, feed_type: FeedType) -> Self {
85        Self {
86            url,
87            title: None,
88            feed_type,
89            mime_type: None,
90            language: None,
91        }
92    }
93
94    /// Check if this is an RSS feed
95    pub fn is_rss(&self) -> bool {
96        matches!(self.feed_type, FeedType::Rss | FeedType::Rss2)
97    }
98
99    /// Check if this is an Atom feed
100    pub fn is_atom(&self) -> bool {
101        matches!(self.feed_type, FeedType::Atom)
102    }
103}
104
105/// Type of web feed
106#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
107pub enum FeedType {
108    /// RSS 1.0
109    Rss,
110    /// RSS 2.0
111    #[default]
112    Rss2,
113    /// Atom
114    Atom,
115    /// JSON Feed
116    Json,
117    /// Unknown/Generic
118    Unknown,
119}
120
121/// A sitemap reference
122#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
123pub struct Sitemap {
124    /// Sitemap URL
125    pub url: String,
126    /// Sitemap type
127    pub sitemap_type: SitemapType,
128    /// Source of discovery
129    pub source: SitemapSource,
130}
131
132impl Sitemap {
133    pub fn new(url: String, sitemap_type: SitemapType) -> Self {
134        Self {
135            url,
136            sitemap_type,
137            source: SitemapSource::LinkTag,
138        }
139    }
140}
141
142/// Type of sitemap
143#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
144pub enum SitemapType {
145    /// Standard XML sitemap
146    #[default]
147    Xml,
148    /// Sitemap index (contains other sitemaps)
149    Index,
150    /// News sitemap
151    News,
152    /// Image sitemap
153    Image,
154    /// Video sitemap
155    Video,
156    /// Text sitemap
157    Text,
158    /// Gzip compressed sitemap
159    Gzip,
160}
161
162/// How sitemap was discovered
163#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
164pub enum SitemapSource {
165    /// From <link> tag
166    #[default]
167    LinkTag,
168    /// From robots.txt
169    RobotsTxt,
170    /// From well-known path
171    WellKnown,
172    /// From sitemap index
173    SitemapIndex,
174}
175
176// ============================================================================
177// EXTRACTION FUNCTIONS
178// ============================================================================
179
180/// Extract all feed and sitemap information from HTML document
181pub fn extract_feed_info(document: &Html, base_url: Option<&Url>) -> ParserResult<FeedInfo> {
182    let mut info = FeedInfo::new();
183
184    // Extract feeds from <link> elements
185    extract_link_feeds(document, &mut info, base_url);
186
187    // Extract sitemaps
188    extract_sitemaps(document, &mut info, base_url);
189
190    // Update flags
191    info.has_feeds = !info.rss_feeds.is_empty() || 
192                     !info.atom_feeds.is_empty() || 
193                     !info.json_feeds.is_empty();
194    info.has_sitemaps = !info.sitemaps.is_empty();
195
196    Ok(info)
197}
198
199/// Extract feeds from <link rel="alternate"> elements
200fn extract_link_feeds(document: &Html, info: &mut FeedInfo, base_url: Option<&Url>) {
201    // RSS and Atom feeds
202    let feed_selector = Selector::parse(
203        r#"link[rel="alternate"][type="application/rss+xml"],
204           link[rel="alternate"][type="application/atom+xml"],
205           link[rel="alternate"][type="application/feed+json"],
206           link[rel="alternate"][type="application/json"]"#
207    ).unwrap();
208
209    for el in document.select(&feed_selector) {
210        let href = match el.value().attr("href") {
211            Some(h) => h,
212            None => continue,
213        };
214
215        let url = resolve_url(href, base_url).unwrap_or_else(|| href.to_string());
216        let mime_type = el.value().attr("type").map(|s| s.to_string());
217        let title = el.value().attr("title").map(|s| s.to_string());
218        let hreflang = el.value().attr("hreflang").map(|s| s.to_string());
219
220        let feed_type = detect_feed_type(&mime_type, &url);
221
222        let mut feed = Feed::new(url, feed_type);
223        feed.title = title;
224        feed.mime_type = mime_type;
225        feed.language = hreflang;
226
227        match feed_type {
228            FeedType::Atom => info.atom_feeds.push(feed),
229            FeedType::Json => info.json_feeds.push(feed),
230            _ => info.rss_feeds.push(feed),
231        }
232    }
233
234    // Also check for feed links in <a> elements (common pattern)
235    if let Ok(sel) = Selector::parse("a[href*='feed'], a[href*='rss'], a[href*='atom']") {
236        for el in document.select(&sel) {
237            if let Some(href) = el.value().attr("href") {
238                let url = resolve_url(href, base_url).unwrap_or_else(|| href.to_string());
239                
240                // Skip if already found
241                if info.all_feeds().iter().any(|f| f.url == url) {
242                    continue;
243                }
244
245                // Detect type from URL
246                let feed_type = detect_feed_type_from_url(&url);
247                if feed_type == FeedType::Unknown {
248                    continue;
249                }
250
251                let mut feed = Feed::new(url, feed_type);
252                feed.title = Some(el.text().collect::<String>().trim().to_string());
253
254                match feed_type {
255                    FeedType::Atom => info.atom_feeds.push(feed),
256                    FeedType::Json => info.json_feeds.push(feed),
257                    _ => info.rss_feeds.push(feed),
258                }
259            }
260        }
261    }
262}
263
264/// Detect feed type from MIME type and URL
265fn detect_feed_type(mime_type: &Option<String>, url: &str) -> FeedType {
266    if let Some(ref mime) = mime_type {
267        match mime.as_str() {
268            "application/atom+xml" => return FeedType::Atom,
269            "application/rss+xml" => return FeedType::Rss2,
270            "application/feed+json" | "application/json" => {
271                if url.contains("feed") {
272                    return FeedType::Json;
273                }
274            }
275            _ => {}
276        }
277    }
278
279    detect_feed_type_from_url(url)
280}
281
282/// Detect feed type from URL patterns
283fn detect_feed_type_from_url(url: &str) -> FeedType {
284    let url_lower = url.to_lowercase();
285    
286    if url_lower.contains("atom") {
287        FeedType::Atom
288    } else if url_lower.contains("rss") || url_lower.contains("feed.xml") {
289        FeedType::Rss2
290    } else if url_lower.ends_with("feed.json") || url_lower.contains("feed/json") {
291        FeedType::Json
292    } else if url_lower.contains("feed") || url_lower.ends_with(".xml") {
293        FeedType::Rss2
294    } else {
295        FeedType::Unknown
296    }
297}
298
299/// Extract sitemap references
300fn extract_sitemaps(document: &Html, info: &mut FeedInfo, base_url: Option<&Url>) {
301    // From <link rel="sitemap"> (less common but valid)
302    if let Ok(sel) = Selector::parse("link[rel='sitemap']") {
303        for el in document.select(&sel) {
304            if let Some(href) = el.value().attr("href") {
305                let url = resolve_url(href, base_url).unwrap_or_else(|| href.to_string());
306                let sitemap_type = detect_sitemap_type(&url);
307                
308                let mut sitemap = Sitemap::new(url, sitemap_type);
309                sitemap.source = SitemapSource::LinkTag;
310                info.sitemaps.push(sitemap);
311            }
312        }
313    }
314
315    // Look for sitemap links in footer/nav
316    if let Ok(sel) = Selector::parse("a[href*='sitemap']") {
317        for el in document.select(&sel) {
318            if let Some(href) = el.value().attr("href") {
319                let url = resolve_url(href, base_url).unwrap_or_else(|| href.to_string());
320                
321                // Skip if already found
322                if info.sitemaps.iter().any(|s| s.url == url) {
323                    continue;
324                }
325
326                let sitemap_type = detect_sitemap_type(&url);
327                let mut sitemap = Sitemap::new(url, sitemap_type);
328                sitemap.source = SitemapSource::LinkTag;
329                info.sitemaps.push(sitemap);
330            }
331        }
332    }
333}
334
335/// Detect sitemap type from URL
336fn detect_sitemap_type(url: &str) -> SitemapType {
337    let url_lower = url.to_lowercase();
338    
339    if url_lower.ends_with(".gz") {
340        SitemapType::Gzip
341    } else if url_lower.contains("sitemap_index") || url_lower.contains("sitemap-index") {
342        SitemapType::Index
343    } else if url_lower.contains("news") {
344        SitemapType::News
345    } else if url_lower.contains("image") {
346        SitemapType::Image
347    } else if url_lower.contains("video") {
348        SitemapType::Video
349    } else if url_lower.ends_with(".txt") {
350        SitemapType::Text
351    } else {
352        SitemapType::Xml
353    }
354}
355
356/// Resolve URL relative to base
357fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
358    if href.starts_with("http://") || href.starts_with("https://") {
359        return Some(href.to_string());
360    }
361
362    if href.starts_with("//") {
363        return Some(format!("https:{}", href));
364    }
365
366    if let Some(base) = base_url {
367        return base.join(href).ok().map(|u| u.to_string());
368    }
369
370    None
371}
372
373// ============================================================================
374// WELL-KNOWN FEED/SITEMAP PATHS
375// ============================================================================
376
/// Paths at which sites commonly publish a feed.
///
/// Used by `generate_feed_urls` to build candidate URLs; the order here is
/// the order of the generated candidates.
pub const COMMON_FEED_PATHS: &[&str] = &[
    // Generic "feed" endpoints
    "/feed",
    "/feed/",
    "/feed.xml",
    "/feed.rss",
    // RSS
    "/rss",
    "/rss/",
    "/rss.xml",
    // Atom
    "/atom.xml",
    "/atom",
    "/feed.atom",
    // Other frequently seen locations
    "/feeds/posts/default",
    "/blog/feed",
    "/blog/rss",
    "/index.xml",
    "/.rss",
    // JSON Feed
    "/feed.json",
];
396
/// Paths at which sites commonly publish a sitemap.
///
/// Used by `generate_sitemap_urls` to build candidate URLs; the order here is
/// the order of the generated candidates.
pub const COMMON_SITEMAP_PATHS: &[&str] = &[
    // Root sitemap and index variants
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap",
    "/sitemaps.xml",
    "/sitemap1.xml",
    "/sitemap-index.xml",
    // Per-content-type sitemaps
    "/post-sitemap.xml",
    "/page-sitemap.xml",
    "/news-sitemap.xml",
    // Compressed
    "/sitemap.xml.gz",
];
410
411/// Generate potential feed URLs for a domain
412pub fn generate_feed_urls(base_url: &Url) -> Vec<String> {
413    COMMON_FEED_PATHS.iter()
414        .filter_map(|path| base_url.join(path).ok())
415        .map(|u| u.to_string())
416        .collect()
417}
418
419/// Generate potential sitemap URLs for a domain
420pub fn generate_sitemap_urls(base_url: &Url) -> Vec<String> {
421    COMMON_SITEMAP_PATHS.iter()
422        .filter_map(|path| base_url.join(path).ok())
423        .map(|u| u.to_string())
424        .collect()
425}
426
427// ============================================================================
428// CONVENIENCE FUNCTIONS
429// ============================================================================
430
431/// Check if document has any feeds
432pub fn has_feeds(document: &Html) -> bool {
433    extract_feed_info(document, None)
434        .map(|i| i.has_feeds)
435        .unwrap_or(false)
436}
437
438/// Get RSS feed URL if exists
439pub fn get_rss_feed(document: &Html, base_url: Option<&Url>) -> Option<String> {
440    extract_feed_info(document, base_url)
441        .ok()
442        .and_then(|i| i.rss_feeds.first().map(|f| f.url.clone()))
443}
444
445/// Get Atom feed URL if exists
446pub fn get_atom_feed(document: &Html, base_url: Option<&Url>) -> Option<String> {
447    extract_feed_info(document, base_url)
448        .ok()
449        .and_then(|i| i.atom_feeds.first().map(|f| f.url.clone()))
450}
451
452/// Get any feed URL (prefers Atom over RSS)
453pub fn get_feed(document: &Html, base_url: Option<&Url>) -> Option<String> {
454    extract_feed_info(document, base_url)
455        .ok()
456        .and_then(|i| i.primary_feed().map(|f| f.url.clone()))
457}
458
459/// Get sitemap URL if found in document
460pub fn get_sitemap(document: &Html, base_url: Option<&Url>) -> Option<String> {
461    extract_feed_info(document, base_url)
462        .ok()
463        .and_then(|i| i.sitemaps.first().map(|s| s.url.clone()))
464}
465
466// ============================================================================
467// TESTS
468// ============================================================================
469
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse an HTML fixture into a `scraper` document.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    /// Base URL used by the tests that resolve relative links.
    fn example_base() -> Url {
        Url::parse("https://example.com/").unwrap()
    }

    /// Run the extractor over `html`, resolving against `base` when given.
    fn extract(html: &str, base: Option<&Url>) -> FeedInfo {
        extract_feed_info(&parse_html(html), base).unwrap()
    }

    #[test]
    fn test_extract_rss_feed() {
        let base = example_base();
        let info = extract(
            r#"
            <html>
            <head>
                <link rel="alternate" type="application/rss+xml" 
                      title="RSS Feed" href="/feed.xml">
            </head>
            </html>
        "#,
            Some(&base),
        );

        assert!(info.has_feeds);
        assert_eq!(info.rss_feeds.len(), 1);

        let feed = &info.rss_feeds[0];
        assert_eq!(feed.url, "https://example.com/feed.xml");
        assert_eq!(feed.title, Some("RSS Feed".to_string()));
    }

    #[test]
    fn test_extract_atom_feed() {
        let base = example_base();
        let info = extract(
            r#"
            <html>
            <head>
                <link rel="alternate" type="application/atom+xml" 
                      title="Atom Feed" href="/atom.xml">
            </head>
            </html>
        "#,
            Some(&base),
        );

        assert!(info.has_feeds);
        assert_eq!(info.atom_feeds.len(), 1);
        assert_eq!(info.atom_feeds[0].feed_type, FeedType::Atom);
    }

    #[test]
    fn test_extract_json_feed() {
        let info = extract(
            r#"
            <html>
            <head>
                <link rel="alternate" type="application/feed+json" 
                      title="JSON Feed" href="/feed.json">
            </head>
            </html>
        "#,
            None,
        );

        assert!(info.has_feeds);
        assert_eq!(info.json_feeds.len(), 1);
        assert_eq!(info.json_feeds[0].feed_type, FeedType::Json);
    }

    #[test]
    fn test_extract_multiple_feeds() {
        let info = extract(
            r#"
            <html>
            <head>
                <link rel="alternate" type="application/rss+xml" href="/rss.xml">
                <link rel="alternate" type="application/atom+xml" href="/atom.xml">
            </head>
            </html>
        "#,
            None,
        );

        assert_eq!(info.all_feeds().len(), 2);

        // Atom takes precedence over RSS.
        let primary = info.primary_feed().unwrap();
        assert_eq!(primary.feed_type, FeedType::Atom);
    }

    #[test]
    fn test_extract_sitemap_link() {
        let base = example_base();
        let info = extract(
            r#"
            <html>
            <body>
                <footer>
                    <a href="/sitemap.xml">Sitemap</a>
                </footer>
            </body>
            </html>
        "#,
            Some(&base),
        );

        assert!(info.has_sitemaps);
        assert_eq!(info.sitemaps[0].url, "https://example.com/sitemap.xml");
    }

    #[test]
    fn test_detect_sitemap_types() {
        let cases = [
            ("/sitemap.xml", SitemapType::Xml),
            ("/sitemap.xml.gz", SitemapType::Gzip),
            ("/sitemap_index.xml", SitemapType::Index),
            ("/news-sitemap.xml", SitemapType::News),
            ("/image-sitemap.xml", SitemapType::Image),
            ("/sitemap.txt", SitemapType::Text),
        ];

        for (url, expected) in cases.iter() {
            assert_eq!(detect_sitemap_type(url), *expected);
        }
    }

    #[test]
    fn test_detect_feed_type_from_url() {
        let cases = [
            ("/atom.xml", FeedType::Atom),
            ("/rss.xml", FeedType::Rss2),
            ("/feed.json", FeedType::Json),
            ("/feed", FeedType::Rss2),
        ];

        for (url, expected) in cases.iter() {
            assert_eq!(detect_feed_type_from_url(url), *expected);
        }
    }

    #[test]
    fn test_generate_feed_urls() {
        let urls = generate_feed_urls(&example_base());

        for expected in [
            "https://example.com/feed",
            "https://example.com/rss.xml",
            "https://example.com/atom.xml",
        ]
        .iter()
        {
            assert!(urls.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_generate_sitemap_urls() {
        let urls = generate_sitemap_urls(&example_base());

        for expected in [
            "https://example.com/sitemap.xml",
            "https://example.com/sitemap_index.xml",
        ]
        .iter()
        {
            assert!(urls.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_feed_info_methods() {
        let mut info = FeedInfo::new();
        info.rss_feeds.push(Feed::new("/rss".to_string(), FeedType::Rss2));
        info.atom_feeds.push(Feed::new("/atom".to_string(), FeedType::Atom));

        assert_eq!(info.all_feeds().len(), 2);
        assert_eq!(info.feed_urls(), vec!["/rss", "/atom"]);

        let primary = info.primary_feed().unwrap();
        assert_eq!(primary.feed_type, FeedType::Atom);
    }

    #[test]
    fn test_feed_is_rss_atom() {
        let rss = Feed::new("/feed".to_string(), FeedType::Rss2);
        assert!(rss.is_rss());
        assert!(!rss.is_atom());

        let atom = Feed::new("/atom".to_string(), FeedType::Atom);
        assert!(atom.is_atom());
        assert!(!atom.is_rss());
    }

    #[test]
    fn test_no_feeds() {
        let info = extract("<html><body><p>No feeds here</p></body></html>", None);

        assert!(!info.has_feeds);
        assert!(!info.has_sitemaps);
    }

    #[test]
    fn test_feed_with_hreflang() {
        let info = extract(
            r#"
            <html>
            <head>
                <link rel="alternate" type="application/rss+xml" 
                      hreflang="en" href="/feed-en.xml">
                <link rel="alternate" type="application/rss+xml" 
                      hreflang="fr" href="/feed-fr.xml">
            </head>
            </html>
        "#,
            None,
        );

        assert_eq!(info.rss_feeds.len(), 2);

        let languages: Vec<Option<String>> = info
            .rss_feeds
            .iter()
            .map(|feed| feed.language.clone())
            .collect();
        assert_eq!(languages, vec![Some("en".to_string()), Some("fr".to_string())]);
    }
}
664        assert_eq!(info.rss_feeds[1].language, Some("fr".to_string()));
665    }
666}