halldyll_parser/
pagination.rs

//! Pagination detection for halldyll-parser
//!
//! This module handles:
//! - rel="next"/rel="prev" link detection
//! - Pagination URL patterns
//! - Page number extraction
//! - Infinite scroll detection
//! - Load more button detection
9
10use regex::Regex;
11use scraper::{Html, Selector};
12use serde::{Deserialize, Serialize};
13use lazy_static::lazy_static;
14use url::Url;
15
16use crate::types::ParserResult;
17
18// ============================================================================
19// TYPES
20// ============================================================================
21
/// Pagination information for a page.
///
/// The derived `Default` yields a value with nothing detected: every
/// `Option` is `None`, every flag is `false`, `page_urls` is empty and
/// `pagination_type` is `PaginationType::None`. `extract_pagination` starts
/// from that state and fills in whatever it finds in the document.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct Pagination {
    /// Current page number (if detected), taken from a pagination link that
    /// is flagged as current or from "page X of Y" text.
    pub current_page: Option<u32>,
    /// Total pages (if detected), taken from the highest page number among
    /// the detected links or from "page X of Y" text.
    pub total_pages: Option<u32>,
    /// Previous page URL (rel="prev" / rel="previous")
    pub prev_url: Option<String>,
    /// Next page URL (rel="next")
    pub next_url: Option<String>,
    /// First page URL (rel="first")
    pub first_url: Option<String>,
    /// Last page URL (rel="last")
    pub last_url: Option<String>,
    /// All detected page URLs with their numbers, ordered by page number
    /// with unnumbered entries last
    pub page_urls: Vec<PageUrl>,
    /// Pagination type detected
    pub pagination_type: PaginationType,
    /// Whether infinite scroll is detected
    pub has_infinite_scroll: bool,
    /// Whether "load more" button is detected
    pub has_load_more: bool,
    /// Items per page (if detected from the page text)
    pub items_per_page: Option<u32>,
    /// Total items (if detected from "showing X of Y" style text)
    pub total_items: Option<u32>,
}
50
51impl Pagination {
52    pub fn new() -> Self {
53        Self::default()
54    }
55
56    /// Check if pagination exists
57    pub fn has_pagination(&self) -> bool {
58        self.prev_url.is_some() || 
59        self.next_url.is_some() || 
60        !self.page_urls.is_empty() ||
61        self.has_infinite_scroll ||
62        self.has_load_more
63    }
64
65    /// Check if there's a next page
66    pub fn has_next(&self) -> bool {
67        self.next_url.is_some()
68    }
69
70    /// Check if there's a previous page
71    pub fn has_prev(&self) -> bool {
72        self.prev_url.is_some()
73    }
74
75    /// Check if this is the first page
76    pub fn is_first_page(&self) -> bool {
77        self.prev_url.is_none() && self.current_page.map(|p| p <= 1).unwrap_or(true)
78    }
79
80    /// Check if this is the last page
81    pub fn is_last_page(&self) -> bool {
82        self.next_url.is_none() && 
83        self.current_page.is_some() && 
84        self.total_pages.is_some() &&
85        self.current_page == self.total_pages
86    }
87
88    /// Get all URLs to crawl for complete pagination
89    pub fn all_page_urls(&self) -> Vec<String> {
90        let mut urls: Vec<String> = self.page_urls.iter()
91            .map(|p| p.url.clone())
92            .collect();
93        
94        if let Some(ref url) = self.first_url {
95            if !urls.contains(url) {
96                urls.insert(0, url.clone());
97            }
98        }
99        if let Some(ref url) = self.last_url {
100            if !urls.contains(url) {
101                urls.push(url.clone());
102            }
103        }
104        
105        urls
106    }
107}
108
/// A page URL with its page number, as found in a pagination link.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PageUrl {
    /// Resolved URL the link points to.
    pub url: String,
    /// Page number derived from the URL or, failing that, from the link
    /// text; `None` when neither yields a number.
    pub page_number: Option<u32>,
    /// Whether the link was flagged as the current page (by a
    /// current/active/selected class or an `aria-current` attribute).
    pub is_current: bool,
}
116
/// Type of pagination detected.
///
/// `None` is the default variant. Note that `determine_pagination_type` in
/// this file never returns `Cursor` or `Offset`; those variants exist in the
/// enum but are not produced by the detection code visible here.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum PaginationType {
    /// Standard numbered pagination (1, 2, 3...)
    Numbered,
    /// Next/Previous only
    NextPrev,
    /// Infinite scroll
    InfiniteScroll,
    /// Load more button
    LoadMore,
    /// Cursor-based pagination
    Cursor,
    /// Offset-based pagination
    Offset,
    /// Unknown or no pagination
    #[default]
    None,
}
136
137// ============================================================================
138// LAZY STATIC PATTERNS
139// ============================================================================
140
// Each pattern is compiled once, on first use. The `unwrap()` calls only
// panic if a literal below is not a valid regex.
lazy_static! {
    /// Pattern for page number in URL query string.
    ///
    /// Case-insensitive. Group 1 is the parameter name, group 2 the digits.
    /// `offset`, `start` and `from` are accepted alongside the page-style
    /// names, so the captured number is not necessarily a page index.
    static ref PAGE_QUERY_PATTERN: Regex = Regex::new(
        r"(?i)[?&](page|p|pg|pn|pagenum|pagenumber|offset|start|from)=(\d+)"
    ).unwrap();

    /// Pattern for page number in URL path, e.g. `/page/2`, `/p/2`, `/pg/2`.
    ///
    /// Case-insensitive. Group 1 is the digits.
    static ref PAGE_PATH_PATTERN: Regex = Regex::new(
        r"(?i)/(?:page|p|pg)/(\d+)"
    ).unwrap();

    /// Pattern for page number at end of path.
    ///
    /// Matches a final all-digit segment with an optional trailing slash;
    /// group 1 is the digits. Any trailing numeric segment matches, whether
    /// or not it is actually a page number.
    static ref PAGE_END_PATTERN: Regex = Regex::new(
        r"/(\d+)/?$"
    ).unwrap();

    /// Pattern for "showing X of Y" text.
    ///
    /// X may be a single number or a range written with a hyphen or an en
    /// dash. Group 1 is Y, the total.
    static ref SHOWING_PATTERN: Regex = Regex::new(
        r"(?i)(?:showing|displaying)\s+(?:\d+[-–]\d+|\d+)\s+(?:of|from)\s+(\d+)"
    ).unwrap();

    /// Pattern for "page X of Y" text (also "page X / Y").
    ///
    /// Group 1 is the current page, group 2 the total.
    static ref PAGE_OF_PATTERN: Regex = Regex::new(
        r"(?i)page\s+(\d+)\s+(?:of|/)\s+(\d+)"
    ).unwrap();

    /// Pattern for items per page.
    ///
    /// Group 1 is the number. Besides "N per page" this also matches a
    /// number followed by "results" or "items", so it can capture a total
    /// such as the `100` in "of 100 results".
    static ref ITEMS_PER_PAGE_PATTERN: Regex = Regex::new(
        r"(?i)(\d+)\s+(?:per\s+page|results|items)"
    ).unwrap();
}
172
173// ============================================================================
174// EXTRACTION FUNCTIONS
175// ============================================================================
176
177/// Extract pagination information from HTML document
178pub fn extract_pagination(document: &Html, base_url: Option<&Url>) -> ParserResult<Pagination> {
179    let mut pagination = Pagination::new();
180
181    // Extract rel="next" and rel="prev" links
182    extract_rel_links(document, &mut pagination, base_url);
183
184    // Extract pagination from link elements
185    extract_page_links(document, &mut pagination, base_url);
186
187    // Detect current page and total pages from text
188    extract_page_info_from_text(document, &mut pagination);
189
190    // Detect infinite scroll
191    pagination.has_infinite_scroll = detect_infinite_scroll(document);
192
193    // Detect load more button
194    pagination.has_load_more = detect_load_more(document);
195
196    // Determine pagination type
197    pagination.pagination_type = determine_pagination_type(&pagination);
198
199    Ok(pagination)
200}
201
202/// Extract rel="next" and rel="prev" links from <link> elements
203fn extract_rel_links(document: &Html, pagination: &mut Pagination, base_url: Option<&Url>) {
204    // rel="next"
205    if let Ok(sel) = Selector::parse("link[rel='next'], a[rel='next']") {
206        if let Some(el) = document.select(&sel).next() {
207            if let Some(href) = el.value().attr("href") {
208                pagination.next_url = resolve_url(href, base_url);
209            }
210        }
211    }
212
213    // rel="prev" or rel="previous"
214    if let Ok(sel) = Selector::parse("link[rel='prev'], link[rel='previous'], a[rel='prev'], a[rel='previous']") {
215        if let Some(el) = document.select(&sel).next() {
216            if let Some(href) = el.value().attr("href") {
217                pagination.prev_url = resolve_url(href, base_url);
218            }
219        }
220    }
221
222    // rel="first"
223    if let Ok(sel) = Selector::parse("link[rel='first'], a[rel='first']") {
224        if let Some(el) = document.select(&sel).next() {
225            if let Some(href) = el.value().attr("href") {
226                pagination.first_url = resolve_url(href, base_url);
227            }
228        }
229    }
230
231    // rel="last"
232    if let Ok(sel) = Selector::parse("link[rel='last'], a[rel='last']") {
233        if let Some(el) = document.select(&sel).next() {
234            if let Some(href) = el.value().attr("href") {
235                pagination.last_url = resolve_url(href, base_url);
236            }
237        }
238    }
239}
240
241/// Extract page links from common pagination patterns
242fn extract_page_links(document: &Html, pagination: &mut Pagination, base_url: Option<&Url>) {
243    // Common pagination selectors
244    let pagination_selectors = [
245        ".pagination a",
246        ".pager a",
247        ".page-numbers a",
248        ".pages a",
249        "nav.pagination a",
250        "[class*='pagination'] a",
251        "[class*='pager'] a",
252        "[aria-label='pagination'] a",
253        "[role='navigation'] a[href*='page']",
254    ];
255
256    let mut seen_urls = std::collections::HashSet::new();
257
258    for selector_str in pagination_selectors {
259        if let Ok(sel) = Selector::parse(selector_str) {
260            for el in document.select(&sel) {
261                if let Some(href) = el.value().attr("href") {
262                    let resolved = resolve_url(href, base_url);
263                    
264                    if let Some(ref url) = resolved {
265                        if seen_urls.contains(url) {
266                            continue;
267                        }
268                        seen_urls.insert(url.clone());
269
270                        // Try to extract page number
271                        let page_number = extract_page_number_from_url(url)
272                            .or_else(|| {
273                                // Try from link text
274                                let text = el.text().collect::<String>().trim().to_string();
275                                text.parse::<u32>().ok()
276                            });
277
278                        // Check if this is current page
279                        let is_current = el.value().classes().any(|c| 
280                            c.contains("current") || c.contains("active") || c.contains("selected")
281                        ) || el.value().attr("aria-current").is_some();
282
283                        let page_url = PageUrl {
284                            url: url.clone(),
285                            page_number,
286                            is_current,
287                        };
288
289                        // Update current page if found
290                        if is_current {
291                            pagination.current_page = page_number;
292                        }
293
294                        pagination.page_urls.push(page_url);
295                    }
296                }
297            }
298        }
299    }
300
301    // Sort by page number
302    pagination.page_urls.sort_by(|a, b| {
303        match (a.page_number, b.page_number) {
304            (Some(a), Some(b)) => a.cmp(&b),
305            (Some(_), None) => std::cmp::Ordering::Less,
306            (None, Some(_)) => std::cmp::Ordering::Greater,
307            (None, None) => std::cmp::Ordering::Equal,
308        }
309    });
310
311    // Deduplicate
312    pagination.page_urls.dedup_by(|a, b| a.url == b.url);
313
314    // Try to determine total pages from max page number
315    if let Some(max) = pagination.page_urls.iter().filter_map(|p| p.page_number).max() {
316        if pagination.total_pages.is_none() {
317            pagination.total_pages = Some(max);
318        }
319    }
320}
321
322/// Extract page number from URL
323pub fn extract_page_number_from_url(url: &str) -> Option<u32> {
324    // Try query parameter patterns
325    if let Some(caps) = PAGE_QUERY_PATTERN.captures(url) {
326        if let Some(num) = caps.get(2) {
327            return num.as_str().parse().ok();
328        }
329    }
330
331    // Try path patterns like /page/2
332    if let Some(caps) = PAGE_PATH_PATTERN.captures(url) {
333        if let Some(num) = caps.get(1) {
334            return num.as_str().parse().ok();
335        }
336    }
337
338    // Try number at end of path
339    if let Some(caps) = PAGE_END_PATTERN.captures(url) {
340        if let Some(num) = caps.get(1) {
341            return num.as_str().parse().ok();
342        }
343    }
344
345    None
346}
347
348/// Extract page info from text content
349fn extract_page_info_from_text(document: &Html, pagination: &mut Pagination) {
350    let body_text = document.root_element().text().collect::<String>();
351
352    // Try "page X of Y" pattern
353    if let Some(caps) = PAGE_OF_PATTERN.captures(&body_text) {
354        if let (Some(current), Some(total)) = (caps.get(1), caps.get(2)) {
355            if pagination.current_page.is_none() {
356                pagination.current_page = current.as_str().parse().ok();
357            }
358            if pagination.total_pages.is_none() {
359                pagination.total_pages = total.as_str().parse().ok();
360            }
361        }
362    }
363
364    // Try "showing X of Y" pattern for total items
365    if let Some(caps) = SHOWING_PATTERN.captures(&body_text) {
366        if let Some(total) = caps.get(1) {
367            pagination.total_items = total.as_str().parse().ok();
368        }
369    }
370
371    // Try items per page pattern
372    if let Some(caps) = ITEMS_PER_PAGE_PATTERN.captures(&body_text) {
373        if let Some(per_page) = caps.get(1) {
374            pagination.items_per_page = per_page.as_str().parse().ok();
375        }
376    }
377}
378
379/// Detect infinite scroll
380fn detect_infinite_scroll(document: &Html) -> bool {
381    let html = document.html().to_lowercase();
382
383    // Check for common infinite scroll libraries/patterns
384    html.contains("infinite-scroll") ||
385    html.contains("infinitescroll") ||
386    html.contains("infinite_scroll") ||
387    html.contains("data-infinite") ||
388    html.contains("waypoint") ||
389    html.contains("scroll-trigger") ||
390    html.contains("lazy-load") && html.contains("scroll")
391}
392
393/// Detect load more button
394fn detect_load_more(document: &Html) -> bool {
395    let load_more_selectors = [
396        "button[class*='load-more']",
397        "button[class*='loadmore']",
398        "a[class*='load-more']",
399        "a[class*='loadmore']",
400        "[class*='show-more']",
401        "[class*='showmore']",
402        "[data-action='load-more']",
403    ];
404
405    for selector_str in load_more_selectors {
406        if let Ok(sel) = Selector::parse(selector_str) {
407            if document.select(&sel).next().is_some() {
408                return true;
409            }
410        }
411    }
412
413    // Also check for common text patterns in buttons
414    if let Ok(sel) = Selector::parse("button, a.btn, a.button") {
415        for el in document.select(&sel) {
416            let text = el.text().collect::<String>().to_lowercase();
417            if text.contains("load more") || 
418               text.contains("show more") ||
419               text.contains("view more") ||
420               text.contains("see more") {
421                return true;
422            }
423        }
424    }
425
426    false
427}
428
429/// Determine pagination type
430fn determine_pagination_type(pagination: &Pagination) -> PaginationType {
431    if pagination.has_infinite_scroll {
432        return PaginationType::InfiniteScroll;
433    }
434
435    if pagination.has_load_more {
436        return PaginationType::LoadMore;
437    }
438
439    if !pagination.page_urls.is_empty() {
440        return PaginationType::Numbered;
441    }
442
443    if pagination.next_url.is_some() || pagination.prev_url.is_some() {
444        return PaginationType::NextPrev;
445    }
446
447    PaginationType::None
448}
449
450/// Resolve URL relative to base
451fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
452    if href.starts_with("http://") || href.starts_with("https://") {
453        return Some(href.to_string());
454    }
455
456    if href.starts_with("//") {
457        return Some(format!("https:{}", href));
458    }
459
460    if let Some(base) = base_url {
461        return base.join(href).ok().map(|u| u.to_string());
462    }
463
464    None
465}
466
467// ============================================================================
468// CONVENIENCE FUNCTIONS
469// ============================================================================
470
471/// Check if document has pagination
472pub fn has_pagination(document: &Html) -> bool {
473    extract_pagination(document, None)
474        .map(|p| p.has_pagination())
475        .unwrap_or(false)
476}
477
478/// Get next page URL if exists
479pub fn get_next_page(document: &Html, base_url: Option<&Url>) -> Option<String> {
480    extract_pagination(document, base_url)
481        .ok()
482        .and_then(|p| p.next_url)
483}
484
485/// Get previous page URL if exists
486pub fn get_prev_page(document: &Html, base_url: Option<&Url>) -> Option<String> {
487    extract_pagination(document, base_url)
488        .ok()
489        .and_then(|p| p.prev_url)
490}
491
/// Generate pagination URL for a specific page number.
///
/// Substitutes every `{page}` in `pattern` with `page_number` and then every
/// `{url}` with `base_url`. Because `{page}` is filled in first, a literal
/// `{page}` inside `base_url` is left as-is.
pub fn generate_page_url(base_url: &str, page_number: u32, pattern: &str) -> String {
    let page = page_number.to_string();
    let with_page = pattern.replace("{page}", &page);
    with_page.replace("{url}", base_url)
}
497
498// ============================================================================
499// TESTS
500// ============================================================================
501
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `markup` and runs `extract_pagination` on it, resolving links
    /// against `base` when one is given.
    fn paginate(markup: &str, base: Option<&str>) -> Pagination {
        let document = Html::parse_document(markup);
        let base_url = base.map(|raw| Url::parse(raw).unwrap());
        extract_pagination(&document, base_url.as_ref()).unwrap()
    }

    /// `<link rel="prev|next">` in the head is resolved against the base URL.
    #[test]
    fn test_extract_rel_next_prev() {
        let markup = r#"
            <html>
            <head>
                <link rel="prev" href="/page/1">
                <link rel="next" href="/page/3">
            </head>
            <body></body>
            </html>
        "#;

        let result = paginate(markup, Some("https://example.com/page/2"));

        assert_eq!(result.prev_url, Some("https://example.com/page/1".to_string()));
        assert_eq!(result.next_url, Some("https://example.com/page/3".to_string()));
        assert!(result.has_pagination());
    }

    /// A `.pagination` container yields numbered links, the active page and
    /// a total taken from the highest number.
    #[test]
    fn test_extract_numbered_pagination() {
        let markup = r#"
            <div class="pagination">
                <a href="/page/1">1</a>
                <a href="/page/2" class="active">2</a>
                <a href="/page/3">3</a>
                <a href="/page/4">4</a>
            </div>
        "#;

        let result = paginate(markup, Some("https://example.com/page/2"));

        assert_eq!(result.page_urls.len(), 4);
        assert_eq!(result.current_page, Some(2));
        assert_eq!(result.total_pages, Some(4));
        assert_eq!(result.pagination_type, PaginationType::Numbered);
    }

    /// Infinite-scroll markers in the markup are detected and classified.
    #[test]
    fn test_detect_infinite_scroll() {
        let markup = r#"
            <html>
            <body>
                <div class="infinite-scroll" data-infinite="true">
                    Content here
                </div>
            </body>
            </html>
        "#;

        let result = paginate(markup, None);

        assert!(result.has_infinite_scroll);
        assert_eq!(result.pagination_type, PaginationType::InfiniteScroll);
    }

    /// A button with a `load-more` class counts as a load-more control.
    #[test]
    fn test_detect_load_more() {
        let markup = r#"
            <html>
            <body>
                <div class="items">Items...</div>
                <button class="load-more">Load More</button>
            </body>
            </html>
        "#;

        let result = paginate(markup, None);

        assert!(result.has_load_more);
    }

    /// Query-parameter and path patterns both yield the page number.
    #[test]
    fn test_extract_page_number_from_url() {
        let cases = [
            ("/articles?page=5", Some(5)),
            ("/blog/page/3", Some(3)),
            ("/posts?p=10", Some(10)),
            ("/items?offset=20", Some(20)),
            ("/no-page-here", None),
        ];

        for (url, expected) in cases {
            assert_eq!(extract_page_number_from_url(url), expected);
        }
    }

    /// "Page X of Y" and "Showing A-B of N" text fill the numeric fields.
    #[test]
    fn test_page_of_text_detection() {
        let markup = r#"
            <html>
            <body>
                <p>Page 3 of 10</p>
                <p>Showing 21-30 of 100 results</p>
            </body>
            </html>
        "#;

        let result = paginate(markup, None);

        assert_eq!(result.current_page, Some(3));
        assert_eq!(result.total_pages, Some(10));
        assert_eq!(result.total_items, Some(100));
    }

    /// First/last predicates follow the current page relative to the total.
    #[test]
    fn test_is_first_last_page() {
        let on_first = Pagination {
            current_page: Some(1),
            total_pages: Some(5),
            ..Pagination::default()
        };
        assert!(on_first.is_first_page());
        assert!(!on_first.is_last_page());

        let on_last = Pagination {
            current_page: Some(5),
            ..on_first
        };
        assert!(!on_last.is_first_page());
        assert!(on_last.is_last_page());
    }

    /// `first_url` is prepended and `last_url` appended to the page list.
    #[test]
    fn test_all_page_urls() {
        let info = Pagination {
            first_url: Some("/page/1".to_string()),
            last_url: Some("/page/5".to_string()),
            page_urls: vec![
                PageUrl { url: "/page/2".to_string(), page_number: Some(2), is_current: false },
                PageUrl { url: "/page/3".to_string(), page_number: Some(3), is_current: true },
            ],
            ..Pagination::default()
        };

        let listed = info.all_page_urls();
        assert_eq!(listed.len(), 4);
        assert_eq!(listed[0], "/page/1");
        assert_eq!(listed[3], "/page/5");
    }

    /// Both placeholders in the pattern are substituted.
    #[test]
    fn test_generate_page_url() {
        let generated = generate_page_url("https://example.com", 5, "{url}/page/{page}");
        assert_eq!(generated, "https://example.com/page/5");
    }

    /// Plain content produces no pagination signal at all.
    #[test]
    fn test_no_pagination() {
        let result = paginate("<html><body><p>Just content, no pagination</p></body></html>", None);

        assert!(!result.has_pagination());
        assert_eq!(result.pagination_type, PaginationType::None);
    }

    /// A generic button is recognised by its "Load more" label.
    #[test]
    fn test_load_more_text_button() {
        let markup = r#"
            <html>
            <body>
                <button class="btn">Load more items</button>
            </body>
            </html>
        "#;

        let result = paginate(markup, None);

        assert!(result.has_load_more);
    }

    /// Links inside `nav[aria-label="pagination"]` are picked up.
    #[test]
    fn test_aria_pagination() {
        let markup = r#"
            <nav aria-label="pagination">
                <a href="/page/1">1</a>
                <a href="/page/2" aria-current="page">2</a>
                <a href="/page/3">3</a>
            </nav>
        "#;

        let result = paginate(markup, Some("https://example.com/"));

        assert!(!result.page_urls.is_empty());
    }
}
694}