1use regex::Regex;
11use scraper::{Html, Selector};
12use serde::{Deserialize, Serialize};
13use lazy_static::lazy_static;
14use url::Url;
15
16use crate::types::ParserResult;
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
24pub struct Pagination {
25 pub current_page: Option<u32>,
27 pub total_pages: Option<u32>,
29 pub prev_url: Option<String>,
31 pub next_url: Option<String>,
33 pub first_url: Option<String>,
35 pub last_url: Option<String>,
37 pub page_urls: Vec<PageUrl>,
39 pub pagination_type: PaginationType,
41 pub has_infinite_scroll: bool,
43 pub has_load_more: bool,
45 pub items_per_page: Option<u32>,
47 pub total_items: Option<u32>,
49}
50
51impl Pagination {
52 pub fn new() -> Self {
53 Self::default()
54 }
55
56 pub fn has_pagination(&self) -> bool {
58 self.prev_url.is_some() ||
59 self.next_url.is_some() ||
60 !self.page_urls.is_empty() ||
61 self.has_infinite_scroll ||
62 self.has_load_more
63 }
64
65 pub fn has_next(&self) -> bool {
67 self.next_url.is_some()
68 }
69
70 pub fn has_prev(&self) -> bool {
72 self.prev_url.is_some()
73 }
74
75 pub fn is_first_page(&self) -> bool {
77 self.prev_url.is_none() && self.current_page.map(|p| p <= 1).unwrap_or(true)
78 }
79
80 pub fn is_last_page(&self) -> bool {
82 self.next_url.is_none() &&
83 self.current_page.is_some() &&
84 self.total_pages.is_some() &&
85 self.current_page == self.total_pages
86 }
87
88 pub fn all_page_urls(&self) -> Vec<String> {
90 let mut urls: Vec<String> = self.page_urls.iter()
91 .map(|p| p.url.clone())
92 .collect();
93
94 if let Some(ref url) = self.first_url {
95 if !urls.contains(url) {
96 urls.insert(0, url.clone());
97 }
98 }
99 if let Some(ref url) = self.last_url {
100 if !urls.contains(url) {
101 urls.push(url.clone());
102 }
103 }
104
105 urls
106 }
107}
108
109#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
111pub struct PageUrl {
112 pub url: String,
113 pub page_number: Option<u32>,
114 pub is_current: bool,
115}
116
117#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
119pub enum PaginationType {
120 Numbered,
122 NextPrev,
124 InfiniteScroll,
126 LoadMore,
128 Cursor,
130 Offset,
132 #[default]
134 None,
135}
136
137lazy_static! {
142 static ref PAGE_QUERY_PATTERN: Regex = Regex::new(
144 r"(?i)[?&](page|p|pg|pn|pagenum|pagenumber|offset|start|from)=(\d+)"
145 ).unwrap();
146
147 static ref PAGE_PATH_PATTERN: Regex = Regex::new(
149 r"(?i)/(?:page|p|pg)/(\d+)"
150 ).unwrap();
151
152 static ref PAGE_END_PATTERN: Regex = Regex::new(
154 r"/(\d+)/?$"
155 ).unwrap();
156
157 static ref SHOWING_PATTERN: Regex = Regex::new(
159 r"(?i)(?:showing|displaying)\s+(?:\d+[-–]\d+|\d+)\s+(?:of|from)\s+(\d+)"
160 ).unwrap();
161
162 static ref PAGE_OF_PATTERN: Regex = Regex::new(
164 r"(?i)page\s+(\d+)\s+(?:of|/)\s+(\d+)"
165 ).unwrap();
166
167 static ref ITEMS_PER_PAGE_PATTERN: Regex = Regex::new(
169 r"(?i)(\d+)\s+(?:per\s+page|results|items)"
170 ).unwrap();
171}
172
173pub fn extract_pagination(document: &Html, base_url: Option<&Url>) -> ParserResult<Pagination> {
179 let mut pagination = Pagination::new();
180
181 extract_rel_links(document, &mut pagination, base_url);
183
184 extract_page_links(document, &mut pagination, base_url);
186
187 extract_page_info_from_text(document, &mut pagination);
189
190 pagination.has_infinite_scroll = detect_infinite_scroll(document);
192
193 pagination.has_load_more = detect_load_more(document);
195
196 pagination.pagination_type = determine_pagination_type(&pagination);
198
199 Ok(pagination)
200}
201
202fn extract_rel_links(document: &Html, pagination: &mut Pagination, base_url: Option<&Url>) {
204 if let Ok(sel) = Selector::parse("link[rel='next'], a[rel='next']") {
206 if let Some(el) = document.select(&sel).next() {
207 if let Some(href) = el.value().attr("href") {
208 pagination.next_url = resolve_url(href, base_url);
209 }
210 }
211 }
212
213 if let Ok(sel) = Selector::parse("link[rel='prev'], link[rel='previous'], a[rel='prev'], a[rel='previous']") {
215 if let Some(el) = document.select(&sel).next() {
216 if let Some(href) = el.value().attr("href") {
217 pagination.prev_url = resolve_url(href, base_url);
218 }
219 }
220 }
221
222 if let Ok(sel) = Selector::parse("link[rel='first'], a[rel='first']") {
224 if let Some(el) = document.select(&sel).next() {
225 if let Some(href) = el.value().attr("href") {
226 pagination.first_url = resolve_url(href, base_url);
227 }
228 }
229 }
230
231 if let Ok(sel) = Selector::parse("link[rel='last'], a[rel='last']") {
233 if let Some(el) = document.select(&sel).next() {
234 if let Some(href) = el.value().attr("href") {
235 pagination.last_url = resolve_url(href, base_url);
236 }
237 }
238 }
239}
240
241fn extract_page_links(document: &Html, pagination: &mut Pagination, base_url: Option<&Url>) {
243 let pagination_selectors = [
245 ".pagination a",
246 ".pager a",
247 ".page-numbers a",
248 ".pages a",
249 "nav.pagination a",
250 "[class*='pagination'] a",
251 "[class*='pager'] a",
252 "[aria-label='pagination'] a",
253 "[role='navigation'] a[href*='page']",
254 ];
255
256 let mut seen_urls = std::collections::HashSet::new();
257
258 for selector_str in pagination_selectors {
259 if let Ok(sel) = Selector::parse(selector_str) {
260 for el in document.select(&sel) {
261 if let Some(href) = el.value().attr("href") {
262 let resolved = resolve_url(href, base_url);
263
264 if let Some(ref url) = resolved {
265 if seen_urls.contains(url) {
266 continue;
267 }
268 seen_urls.insert(url.clone());
269
270 let page_number = extract_page_number_from_url(url)
272 .or_else(|| {
273 let text = el.text().collect::<String>().trim().to_string();
275 text.parse::<u32>().ok()
276 });
277
278 let is_current = el.value().classes().any(|c|
280 c.contains("current") || c.contains("active") || c.contains("selected")
281 ) || el.value().attr("aria-current").is_some();
282
283 let page_url = PageUrl {
284 url: url.clone(),
285 page_number,
286 is_current,
287 };
288
289 if is_current {
291 pagination.current_page = page_number;
292 }
293
294 pagination.page_urls.push(page_url);
295 }
296 }
297 }
298 }
299 }
300
301 pagination.page_urls.sort_by(|a, b| {
303 match (a.page_number, b.page_number) {
304 (Some(a), Some(b)) => a.cmp(&b),
305 (Some(_), None) => std::cmp::Ordering::Less,
306 (None, Some(_)) => std::cmp::Ordering::Greater,
307 (None, None) => std::cmp::Ordering::Equal,
308 }
309 });
310
311 pagination.page_urls.dedup_by(|a, b| a.url == b.url);
313
314 if let Some(max) = pagination.page_urls.iter().filter_map(|p| p.page_number).max() {
316 if pagination.total_pages.is_none() {
317 pagination.total_pages = Some(max);
318 }
319 }
320}
321
322pub fn extract_page_number_from_url(url: &str) -> Option<u32> {
324 if let Some(caps) = PAGE_QUERY_PATTERN.captures(url) {
326 if let Some(num) = caps.get(2) {
327 return num.as_str().parse().ok();
328 }
329 }
330
331 if let Some(caps) = PAGE_PATH_PATTERN.captures(url) {
333 if let Some(num) = caps.get(1) {
334 return num.as_str().parse().ok();
335 }
336 }
337
338 if let Some(caps) = PAGE_END_PATTERN.captures(url) {
340 if let Some(num) = caps.get(1) {
341 return num.as_str().parse().ok();
342 }
343 }
344
345 None
346}
347
348fn extract_page_info_from_text(document: &Html, pagination: &mut Pagination) {
350 let body_text = document.root_element().text().collect::<String>();
351
352 if let Some(caps) = PAGE_OF_PATTERN.captures(&body_text) {
354 if let (Some(current), Some(total)) = (caps.get(1), caps.get(2)) {
355 if pagination.current_page.is_none() {
356 pagination.current_page = current.as_str().parse().ok();
357 }
358 if pagination.total_pages.is_none() {
359 pagination.total_pages = total.as_str().parse().ok();
360 }
361 }
362 }
363
364 if let Some(caps) = SHOWING_PATTERN.captures(&body_text) {
366 if let Some(total) = caps.get(1) {
367 pagination.total_items = total.as_str().parse().ok();
368 }
369 }
370
371 if let Some(caps) = ITEMS_PER_PAGE_PATTERN.captures(&body_text) {
373 if let Some(per_page) = caps.get(1) {
374 pagination.items_per_page = per_page.as_str().parse().ok();
375 }
376 }
377}
378
379fn detect_infinite_scroll(document: &Html) -> bool {
381 let html = document.html().to_lowercase();
382
383 html.contains("infinite-scroll") ||
385 html.contains("infinitescroll") ||
386 html.contains("infinite_scroll") ||
387 html.contains("data-infinite") ||
388 html.contains("waypoint") ||
389 html.contains("scroll-trigger") ||
390 html.contains("lazy-load") && html.contains("scroll")
391}
392
393fn detect_load_more(document: &Html) -> bool {
395 let load_more_selectors = [
396 "button[class*='load-more']",
397 "button[class*='loadmore']",
398 "a[class*='load-more']",
399 "a[class*='loadmore']",
400 "[class*='show-more']",
401 "[class*='showmore']",
402 "[data-action='load-more']",
403 ];
404
405 for selector_str in load_more_selectors {
406 if let Ok(sel) = Selector::parse(selector_str) {
407 if document.select(&sel).next().is_some() {
408 return true;
409 }
410 }
411 }
412
413 if let Ok(sel) = Selector::parse("button, a.btn, a.button") {
415 for el in document.select(&sel) {
416 let text = el.text().collect::<String>().to_lowercase();
417 if text.contains("load more") ||
418 text.contains("show more") ||
419 text.contains("view more") ||
420 text.contains("see more") {
421 return true;
422 }
423 }
424 }
425
426 false
427}
428
429fn determine_pagination_type(pagination: &Pagination) -> PaginationType {
431 if pagination.has_infinite_scroll {
432 return PaginationType::InfiniteScroll;
433 }
434
435 if pagination.has_load_more {
436 return PaginationType::LoadMore;
437 }
438
439 if !pagination.page_urls.is_empty() {
440 return PaginationType::Numbered;
441 }
442
443 if pagination.next_url.is_some() || pagination.prev_url.is_some() {
444 return PaginationType::NextPrev;
445 }
446
447 PaginationType::None
448}
449
450fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
452 if href.starts_with("http://") || href.starts_with("https://") {
453 return Some(href.to_string());
454 }
455
456 if href.starts_with("//") {
457 return Some(format!("https:{}", href));
458 }
459
460 if let Some(base) = base_url {
461 return base.join(href).ok().map(|u| u.to_string());
462 }
463
464 None
465}
466
467pub fn has_pagination(document: &Html) -> bool {
473 extract_pagination(document, None)
474 .map(|p| p.has_pagination())
475 .unwrap_or(false)
476}
477
478pub fn get_next_page(document: &Html, base_url: Option<&Url>) -> Option<String> {
480 extract_pagination(document, base_url)
481 .ok()
482 .and_then(|p| p.next_url)
483}
484
485pub fn get_prev_page(document: &Html, base_url: Option<&Url>) -> Option<String> {
487 extract_pagination(document, base_url)
488 .ok()
489 .and_then(|p| p.prev_url)
490}
491
/// Builds a page URL by expanding the placeholders in `pattern`.
///
/// * `{page}` is replaced by `page_number`.
/// * `{url}` is replaced by `base_url`. When the placeholder is immediately
///   followed by `/` and `base_url` itself ends in `/`, one trailing slash of
///   the base is dropped so the result does not contain `//` at the seam.
///
/// Expansion is a single left-to-right pass: text inserted for one
/// placeholder is never scanned for further placeholders, and any other
/// `{...}` text is copied through unchanged.
pub fn generate_page_url(base_url: &str, page_number: u32, pattern: &str) -> String {
    let page = page_number.to_string();
    let base_without_slash = base_url.strip_suffix('/').unwrap_or(base_url);

    let mut output = String::with_capacity(pattern.len() + base_url.len() + page.len());
    let mut rest = pattern;

    while let Some(brace) = rest.find('{') {
        output.push_str(&rest[..brace]);
        let tail = &rest[brace..];

        if let Some(after) = tail.strip_prefix("{page}") {
            output.push_str(&page);
            rest = after;
        } else if let Some(after) = tail.strip_prefix("{url}") {
            // Avoid "base/" + "/suffix" producing a doubled slash.
            if after.starts_with('/') {
                output.push_str(base_without_slash);
            } else {
                output.push_str(base_url);
            }
            rest = after;
        } else {
            // Not a known placeholder: keep the brace literally.
            output.push('{');
            rest = &tail[1..];
        }
    }

    output.push_str(rest);
    output
}
497
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses an HTML fixture into a document.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_extract_rel_next_prev() {
        let html = r#"
            <html>
            <head>
                <link rel="prev" href="/page/1">
                <link rel="next" href="/page/3">
            </head>
            <body></body>
            </html>
        "#;

        let doc = parse_html(html);
        let base = Url::parse("https://example.com/page/2").unwrap();
        let pagination = extract_pagination(&doc, Some(&base)).unwrap();

        assert_eq!(pagination.prev_url, Some("https://example.com/page/1".to_string()));
        assert_eq!(pagination.next_url, Some("https://example.com/page/3".to_string()));
        assert!(pagination.has_pagination());
    }

    #[test]
    fn test_extract_numbered_pagination() {
        let html = r#"
            <div class="pagination">
                <a href="/page/1">1</a>
                <a href="/page/2" class="active">2</a>
                <a href="/page/3">3</a>
                <a href="/page/4">4</a>
            </div>
        "#;

        let doc = parse_html(html);
        let base = Url::parse("https://example.com/page/2").unwrap();
        let pagination = extract_pagination(&doc, Some(&base)).unwrap();

        assert_eq!(pagination.page_urls.len(), 4);
        assert_eq!(pagination.current_page, Some(2));
        assert_eq!(pagination.total_pages, Some(4));
        assert_eq!(pagination.pagination_type, PaginationType::Numbered);
    }

    #[test]
    fn test_detect_infinite_scroll() {
        let html = r#"
            <html>
            <body>
                <div class="infinite-scroll" data-infinite="true">
                    Content here
                </div>
            </body>
            </html>
        "#;

        let doc = parse_html(html);
        let pagination = extract_pagination(&doc, None).unwrap();

        assert!(pagination.has_infinite_scroll);
        assert_eq!(pagination.pagination_type, PaginationType::InfiniteScroll);
    }

    #[test]
    fn test_detect_load_more() {
        let html = r#"
            <html>
            <body>
                <div class="items">Items...</div>
                <button class="load-more">Load More</button>
            </body>
            </html>
        "#;

        let doc = parse_html(html);
        let pagination = extract_pagination(&doc, None).unwrap();

        assert!(pagination.has_load_more);
    }

    #[test]
    fn test_extract_page_number_from_url() {
        assert_eq!(extract_page_number_from_url("/articles?page=5"), Some(5));
        assert_eq!(extract_page_number_from_url("/blog/page/3"), Some(3));
        assert_eq!(extract_page_number_from_url("/posts?p=10"), Some(10));
        assert_eq!(extract_page_number_from_url("/items?offset=20"), Some(20));
        assert_eq!(extract_page_number_from_url("/no-page-here"), None);
    }

    #[test]
    fn test_page_of_text_detection() {
        let html = r#"
            <html>
            <body>
                <p>Page 3 of 10</p>
                <p>Showing 21-30 of 100 results</p>
            </body>
            </html>
        "#;

        let doc = parse_html(html);
        let pagination = extract_pagination(&doc, None).unwrap();

        assert_eq!(pagination.current_page, Some(3));
        assert_eq!(pagination.total_pages, Some(10));
        assert_eq!(pagination.total_items, Some(100));
    }

    #[test]
    fn test_is_first_last_page() {
        let mut pagination = Pagination {
            current_page: Some(1),
            total_pages: Some(5),
            ..Pagination::default()
        };

        assert!(pagination.is_first_page());
        assert!(!pagination.is_last_page());

        pagination.current_page = Some(5);
        assert!(!pagination.is_first_page());
        assert!(pagination.is_last_page());
    }

    #[test]
    fn test_all_page_urls() {
        let pagination = Pagination {
            first_url: Some("/page/1".to_string()),
            last_url: Some("/page/5".to_string()),
            page_urls: vec![
                PageUrl { url: "/page/2".to_string(), page_number: Some(2), is_current: false },
                PageUrl { url: "/page/3".to_string(), page_number: Some(3), is_current: true },
            ],
            ..Pagination::default()
        };

        let all_urls = pagination.all_page_urls();
        assert_eq!(all_urls.len(), 4);
        assert_eq!(all_urls[0], "/page/1");
        assert_eq!(all_urls[3], "/page/5");
    }

    #[test]
    fn test_generate_page_url() {
        let url = generate_page_url("https://example.com", 5, "{url}/page/{page}");
        assert_eq!(url, "https://example.com/page/5");
    }

    #[test]
    fn test_no_pagination() {
        let html = "<html><body><p>Just content, no pagination</p></body></html>";
        let doc = parse_html(html);
        let pagination = extract_pagination(&doc, None).unwrap();

        assert!(!pagination.has_pagination());
        assert_eq!(pagination.pagination_type, PaginationType::None);
    }

    #[test]
    fn test_load_more_text_button() {
        let html = r#"
            <html>
            <body>
                <button class="btn">Load more items</button>
            </body>
            </html>
        "#;

        let doc = parse_html(html);
        let pagination = extract_pagination(&doc, None).unwrap();

        assert!(pagination.has_load_more);
    }

    #[test]
    fn test_aria_pagination() {
        let html = r#"
            <nav aria-label="pagination">
                <a href="/page/1">1</a>
                <a href="/page/2" aria-current="page">2</a>
                <a href="/page/3">3</a>
            </nav>
        "#;

        let doc = parse_html(html);
        let base = Url::parse("https://example.com/").unwrap();
        let pagination = extract_pagination(&doc, Some(&base)).unwrap();

        assert!(!pagination.page_urls.is_empty());
    }
}