use crate::error::{Result, ScrapeError};
use regex::Regex;
use scraper::{Html, Selector};
use std::collections::{HashMap, HashSet};
use tracing::{debug, trace};
use url::Url;
/// Limits and safety switches applied by `PaginationDetector` while it
/// discovers pagination links.
#[derive(Debug, Clone)]
pub struct PaginationConfig {
/// Upper bound compared against the number of URLs the detector has recorded
/// (pages it inspected plus links it already returned); once that count
/// reaches this value, detection returns no links.
pub max_pages: usize,
/// Maximum depth at which a page is still inspected. A page first seen as the
/// current URL has depth 0 and every returned link has its parent's depth + 1.
pub max_depth: usize,
/// When `true`, a candidate whose page number is not greater than the current
/// page's number is discarded as circular.
pub detect_circular: bool,
}
impl Default for PaginationConfig {
fn default() -> Self {
Self {
max_pages: 50,
max_depth: 10,
detect_circular: true,
}
}
}
/// Stateful pagination-link detector.
///
/// It remembers which URLs it has already recorded and at which depth, so that
/// repeated calls to `detect_pagination` only report links not seen before and
/// stop once the configured limits are hit.
pub struct PaginationDetector {
/// Limits consulted on every `detect_pagination` call.
config: PaginationConfig,
/// Every URL passed in as the current page plus every link already returned.
visited_pagination: HashSet<String>,
/// Depth recorded per URL: 0 for a page first seen as the current URL,
/// parent depth + 1 for each link returned from that page.
pagination_depth: HashMap<String, usize>,
}
impl PaginationDetector {
/// Creates a detector that enforces `config` and starts with no recorded
/// URLs and no recorded depths.
pub fn new(config: PaginationConfig) -> Self {
    let visited_pagination = HashSet::default();
    let pagination_depth = HashMap::default();
    Self {
        config,
        visited_pagination,
        pagination_depth,
    }
}
/// Creates a detector configured with `PaginationConfig::default()`.
pub fn new_default() -> Self {
    let config = PaginationConfig::default();
    Self::new(config)
}
/// Returns the pagination URLs found on `current_url` that have not been
/// recorded before.
///
/// `html` is the page body and `current_url` is both the base for resolving
/// relative links and the key under which the page is recorded.
///
/// An empty vector is returned when the number of recorded URLs has reached
/// `max_pages`, when the current page's depth has reached `max_depth`, or when
/// the HTML contains a disabled "next" control.
///
/// Candidates are gathered in priority order — `rel="next"` link, "next"-style
/// text/class links, URL-pattern guess, numbered links — and that order is
/// preserved in the result, so the first element is the strongest signal.
/// The result is truncated so that the total number of recorded URLs never
/// exceeds `max_pages`. Every returned URL is recorded as visited with depth
/// `current depth + 1`.
pub fn detect_pagination(&mut self, html: &str, current_url: &str) -> Vec<String> {
    // Record the page being inspected. A page that was never returned by an
    // earlier call has no depth yet and is treated as a root (depth 0).
    if !self.visited_pagination.contains(current_url) {
        self.visited_pagination.insert(current_url.to_string());
        self.pagination_depth
            .entry(current_url.to_string())
            .or_insert(0);
    }

    if self.visited_pagination.len() >= self.config.max_pages {
        debug!(
            "Reached max pagination pages ({}), stopping pagination detection",
            self.config.max_pages
        );
        return Vec::new();
    }

    let current_depth = self.pagination_depth.get(current_url).copied().unwrap_or(0);
    if current_depth >= self.config.max_depth {
        debug!(
            "Reached max pagination depth ({}) for {}, stopping pagination detection",
            self.config.max_depth, current_url
        );
        return Vec::new();
    }

    if Self::is_last_page(html) {
        debug!("Detected last page indicator in HTML, stopping pagination");
        return Vec::new();
    }

    // De-duplicate while keeping discovery order; iterating a HashSet would
    // hand the caller the links in an arbitrary order.
    let mut seen = HashSet::new();
    let mut candidates = Vec::new();
    let mut remember = |url: String| {
        if seen.insert(url.clone()) {
            candidates.push(url);
        }
    };

    if let Some(next_url) = Self::find_rel_next(html, current_url) {
        trace!("Found rel=next pagination link: {}", next_url);
        remember(next_url);
    }
    for next_url in Self::find_next_text_links(html, current_url) {
        trace!("Found 'Next' text pagination link: {}", next_url);
        remember(next_url);
    }
    if let Some(next_url) = Self::detect_url_pattern(current_url) {
        trace!("Detected URL pattern pagination: {}", next_url);
        remember(next_url);
    }
    for page_url in Self::find_numbered_links(html, current_url) {
        trace!("Found numbered pagination link: {}", page_url);
        remember(page_url);
    }

    // The limit check above guarantees at least one free slot; never record
    // more new URLs than the remaining `max_pages` budget allows.
    let remaining = self
        .config
        .max_pages
        .saturating_sub(self.visited_pagination.len());

    let new_pages: Vec<String> = candidates
        .into_iter()
        .filter(|url| {
            if self.visited_pagination.contains(url) {
                debug!("Skipping already visited pagination: {}", url);
                return false;
            }
            if self.config.detect_circular && Self::is_circular_pagination(url, current_url) {
                debug!("Detected circular pagination, skipping: {}", url);
                return false;
            }
            true
        })
        .take(remaining)
        .collect();

    for page in &new_pages {
        self.pagination_depth.insert(page.clone(), current_depth + 1);
        self.visited_pagination.insert(page.clone());
    }

    if !new_pages.is_empty() {
        debug!(
            "Detected {} new pagination link(s) from {} (depth: {}, total visited: {})",
            new_pages.len(),
            current_url,
            current_depth,
            self.visited_pagination.len()
        );
    }

    new_pages
}
/// Reports whether `html` contains a disabled "next" control, which is taken
/// as the sign that there is no further page.
///
/// Three selector groups are tried in order: elements carrying a `disabled`
/// attribute, elements whose `aria-label` mentions "next" and that are
/// disabled, and elements combining a "next" class with a `disabled` class.
/// A selector that fails to parse is skipped.
fn is_last_page(html: &str) -> bool {
    const DISABLED_NEXT_SELECTORS: [&str; 3] = [
        r#"a[rel="next"][disabled], button.next[disabled], .next[disabled]"#,
        r#"[aria-label*="next" i][aria-disabled="true"], [aria-label*="next" i][disabled]"#,
        r#".next.disabled, .pager-next.disabled, .pagination-next.disabled"#,
    ];

    let document = Html::parse_document(html);
    DISABLED_NEXT_SELECTORS
        .iter()
        .filter_map(|css| Selector::parse(css).ok())
        .any(|selector| document.select(&selector).next().is_some())
}
/// Returns `true` when both URLs carry a page number and the candidate's
/// number is not greater than the current page's number, i.e. following the
/// link would go sideways or backwards. If either URL has no recognizable page
/// number the link is not considered circular.
fn is_circular_pagination(next_url: &str, current_url: &str) -> bool {
    match (
        Self::extract_page_number(next_url),
        Self::extract_page_number(current_url),
    ) {
        (Some(target), Some(origin)) => target <= origin,
        _ => false,
    }
}
fn extract_page_number(url: &str) -> Option<u32> {
if let Ok(re) = Regex::new(r"/page/(\d+)") {
if let Some(caps) = re.captures(url) {
if let Some(num_str) = caps.get(1) {
if let Ok(num) = num_str.as_str().parse() {
return Some(num);
}
}
}
}
if let Ok(re) = Regex::new(r"[?&]page=(\d+)") {
if let Some(caps) = re.captures(url) {
if let Some(num_str) = caps.get(1) {
if let Ok(num) = num_str.as_str().parse() {
return Some(num);
}
}
}
}
if let Ok(re) = Regex::new(r"[?&]p=(\d+)") {
if let Some(caps) = re.captures(url) {
if let Some(num_str) = caps.get(1) {
if let Ok(num) = num_str.as_str().parse() {
return Some(num);
}
}
}
}
None
}
/// Finds the first `<a>` or `<link>` element whose `rel` attribute contains
/// the `next` token and returns its `href` resolved against `base_url`.
///
/// `rel` is a space-separated list of tokens, so the selector uses the
/// word-match operator (`~=`) with the case-insensitive flag; an exact match
/// on `rel="next"` would miss values such as `rel="next nofollow"`.
/// Elements without an `href`, or whose `href` cannot be resolved, are skipped.
/// Returns `None` when no usable element exists.
fn find_rel_next(html: &str, base_url: &str) -> Option<String> {
    let document = Html::parse_document(html);
    let selector = Selector::parse(r#"a[rel~="next" i], link[rel~="next" i]"#).ok()?;

    document
        .select(&selector)
        .filter_map(|element| element.value().attr("href"))
        .find_map(|href| Self::resolve_url(base_url, href).ok())
}
/// Collects links that look like a "next page" control, judged by their text
/// or their `class` attribute, resolved against `base_url`.
///
/// An anchor qualifies when its lower-cased text contains `next` or one of the
/// arrow glyphs `→`, `»`, `›`, or when its class contains `next`, or contains
/// `pager` without also containing `prev` (a pager link explicitly marked as
/// "previous" is not a next link).
///
/// Anchors whose `href` is missing, empty, fragment-only (`#...`, which points
/// back into the same page) or a `javascript:` pseudo-URL are ignored, as are
/// hrefs that cannot be resolved. The result may contain duplicates.
fn find_next_text_links(html: &str, base_url: &str) -> Vec<String> {
    // Each marker is listed once; repeating a marker cannot change the result.
    const NEXT_MARKERS: [&str; 4] = ["next", "→", "»", "›"];

    let document = Html::parse_document(html);
    let selector = match Selector::parse("a") {
        Ok(s) => s,
        Err(_) => return Vec::new(),
    };

    let mut links = Vec::new();
    for element in document.select(&selector) {
        let text = element.text().collect::<String>().to_lowercase();
        let is_next = NEXT_MARKERS.iter().any(|marker| text.contains(*marker));

        let has_next_class = element
            .value()
            .attr("class")
            .map(|c| c.contains("next") || (c.contains("pager") && !c.contains("prev")))
            .unwrap_or(false);

        if !(is_next || has_next_class) {
            continue;
        }

        let href = match element.value().attr("href") {
            Some(h) => h.trim(),
            None => continue,
        };
        if href.is_empty() || href.starts_with('#') || href.starts_with("javascript:") {
            continue;
        }

        if let Ok(url) = Self::resolve_url(base_url, href) {
            links.push(url);
        }
    }
    links
}
fn detect_url_pattern(current_url: &str) -> Option<String> {
let current = Url::parse(current_url).ok()?;
let page_path_regex = Regex::new(r"/page/(\d+)/?").ok()?;
if let Some(caps) = page_path_regex.captures(current.path()) {
let current_page: u32 = caps[1].parse().ok()?;
let next_page = current_page + 1;
let next_url = current_url.replace(
&format!("/page/{}/", current_page),
&format!("/page/{}/", next_page),
);
let next_url = if next_url == current_url {
current_url.replace(
&format!("/page/{}", current_page),
&format!("/page/{}", next_page),
)
} else {
next_url
};
return Some(next_url);
}
let query_page_regex = Regex::new(r"[?&]page=(\d+)").ok()?;
if let Some(caps) = query_page_regex.captures(current_url) {
let current_page: u32 = caps[1].parse().ok()?;
let next_page = current_page + 1;
let next_url = current_url.replace(
&format!("page={}", current_page),
&format!("page={}", next_page),
);
return Some(next_url);
}
let query_p_regex = Regex::new(r"[?&]p=(\d+)").ok()?;
if let Some(caps) = query_p_regex.captures(current_url) {
let current_page: u32 = caps[1].parse().ok()?;
let next_page = current_page + 1;
let next_url =
current_url.replace(&format!("p={}", current_page), &format!("p={}", next_page));
return Some(next_url);
}
None
}
/// Collects links whose visible text is nothing but a number (e.g. `2`, `3`),
/// resolved against `base_url`.
///
/// When `base_url` itself carries a page number, links labelled with that
/// number or a smaller one are skipped. Labels that do not fit in a `u32`,
/// anchors without an `href`, and hrefs that cannot be resolved are skipped
/// as well.
fn find_numbered_links(html: &str, base_url: &str) -> Vec<String> {
    let document = Html::parse_document(html);
    let anchors = match Selector::parse("a") {
        Ok(selector) => selector,
        Err(_) => return Vec::new(),
    };
    let numeric_label = match Regex::new(r"^\s*(\d+)\s*$") {
        Ok(regex) => regex,
        Err(_) => return Vec::new(),
    };
    let current_page = Self::extract_current_page_number(base_url);

    document
        .select(&anchors)
        .filter_map(|anchor| {
            let label = anchor.text().collect::<String>();
            let caps = numeric_label.captures(label.trim())?;
            let page_num: u32 = caps[1].parse().ok()?;

            // Only forward links are interesting when the current page is known.
            if current_page.map_or(false, |current| page_num <= current) {
                return None;
            }

            let href = anchor.value().attr("href")?;
            Self::resolve_url(base_url, href).ok()
        })
        .collect()
}
fn extract_current_page_number(url: &str) -> Option<u32> {
let page_path_regex = Regex::new(r"/page/(\d+)").ok()?;
if let Some(caps) = page_path_regex.captures(url) {
return caps[1].parse().ok();
}
let query_page_regex = Regex::new(r"[?&]page=(\d+)").ok()?;
if let Some(caps) = query_page_regex.captures(url) {
return caps[1].parse().ok();
}
let query_p_regex = Regex::new(r"[?&]p=(\d+)").ok()?;
if let Some(caps) = query_p_regex.captures(url) {
return caps[1].parse().ok();
}
None
}
/// Resolves `relative` against `base` and returns the absolute URL as a string.
///
/// # Errors
///
/// Returns `ScrapeError::InvalidUrl` when `base` is not a valid URL or when
/// `relative` cannot be joined onto it.
fn resolve_url(base: &str, relative: &str) -> Result<String> {
    let resolved = Url::parse(base)
        .map_err(|err| ScrapeError::InvalidUrl(format!("Invalid base URL: {}", err)))?
        .join(relative)
        .map_err(|err| ScrapeError::InvalidUrl(format!("Failed to resolve URL: {}", err)))?;

    Ok(resolved.to_string())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_rel_next() {
        let html = r#"<a href="/page/2/" rel="next">Next</a>"#;
        let mut detector = PaginationDetector::new_default();
        let pages = detector.detect_pagination(html, "https://example.com/page/1/");
        assert!(!pages.is_empty());
        assert!(pages.contains(&"https://example.com/page/2/".to_string()));
    }

    #[test]
    fn test_detect_next_text_link() {
        let html = r#"<a href="/page/2/">Next →</a>"#;
        let mut detector = PaginationDetector::new_default();
        let pages = detector.detect_pagination(html, "https://example.com/page/1/");
        assert!(!pages.is_empty());
        assert!(pages.contains(&"https://example.com/page/2/".to_string()));
    }

    #[test]
    fn test_detect_page_pattern() {
        let html = "";
        let mut detector = PaginationDetector::new_default();
        let pages = detector.detect_pagination(html, "https://quotes.toscrape.com/page/1/");
        assert!(!pages.is_empty());
        assert!(pages.iter().any(|p| p.contains("/page/2/")));
    }

    #[test]
    fn test_detect_query_page_pattern() {
        let html = "";
        let mut detector = PaginationDetector::new_default();
        let pages = detector.detect_pagination(html, "https://example.com/search?page=1");
        assert!(!pages.is_empty());
        assert!(pages.contains(&"https://example.com/search?page=2".to_string()));
    }

    #[test]
    fn test_numbered_pagination() {
        let html = r#"
            <nav>
                <a href="/page/1/">1</a>
                <a href="/page/2/">2</a>
                <a href="/page/3/">3</a>
            </nav>
        "#;
        let mut detector = PaginationDetector::new_default();
        let pages = detector.detect_pagination(html, "https://example.com/page/1/");
        assert!(pages.len() >= 2);
        assert!(pages.contains(&"https://example.com/page/2/".to_string()));
        assert!(pages.contains(&"https://example.com/page/3/".to_string()));
    }

    #[test]
    fn test_quotes_toscrape_pagination() {
        let html = r#"
            <nav>
                <ul class="pager">
                    <li class="next">
                        <a href="/page/2/">Next <span aria-hidden="true">→</span></a>
                    </li>
                </ul>
            </nav>
        "#;
        let mut detector = PaginationDetector::new_default();
        let pages = detector.detect_pagination(html, "https://quotes.toscrape.com/page/1/");
        assert!(!pages.is_empty());
        assert!(pages.contains(&"https://quotes.toscrape.com/page/2/".to_string()));
    }

    #[test]
    fn test_no_pagination_on_last_page() {
        let html = r#"
            <nav>
                <ul class="pager">
                    <li class="previous">
                        <a href="/page/9/">Previous</a>
                    </li>
                </ul>
            </nav>
        "#;
        let mut detector = PaginationDetector::new_default();
        let pages = detector.detect_pagination(html, "https://quotes.toscrape.com/page/10/");
        // The HTML offers no "next" link, but the URL-pattern heuristic still
        // proposes /page/11/, so the result is not empty.
        assert!(!pages.is_empty());
        // The "Previous" link must never be reported as a next page.
        assert!(!pages.contains(&"https://quotes.toscrape.com/page/9/".to_string()));
    }

    #[test]
    fn test_extract_page_number() {
        assert_eq!(
            PaginationDetector::extract_page_number("https://example.com/page/5/"),
            Some(5)
        );
        assert_eq!(
            PaginationDetector::extract_page_number("https://example.com?page=3"),
            Some(3)
        );
        assert_eq!(
            PaginationDetector::extract_page_number("https://example.com?p=7"),
            Some(7)
        );
        assert_eq!(
            PaginationDetector::extract_page_number("https://example.com/about"),
            None
        );
    }

    #[test]
    fn test_is_circular_pagination_logic() {
        assert!(PaginationDetector::is_circular_pagination(
            "https://example.com/page/1/",
            "https://example.com/page/2/"
        ));
        assert!(PaginationDetector::is_circular_pagination(
            "https://example.com/page/2/",
            "https://example.com/page/2/"
        ));
        assert!(!PaginationDetector::is_circular_pagination(
            "https://example.com/page/3/",
            "https://example.com/page/2/"
        ));
    }

    #[test]
    fn test_pagination_limit_enforced() {
        let mut detector = PaginationDetector::new(PaginationConfig {
            max_pages: 5,
            max_depth: 10,
            detect_circular: true,
        });
        for i in 1..=10 {
            let html = format!(r#"<a href="/page/{}/" rel="next">Next</a>"#, i + 1);
            let pages =
                detector.detect_pagination(&html, &format!("https://example.com/page/{}/", i));
            if i >= 5 {
                assert_eq!(pages.len(), 0, "Expected no pages after limit at iteration {}", i);
            }
        }
    }

    #[test]
    fn test_circular_pagination_detected() {
        let mut detector = PaginationDetector::new_default();
        let html = r#"<a href="/articles?page=2" rel="next">Next</a>"#;
        let pages = detector.detect_pagination(html, "https://example.com/articles?page=1");
        assert_eq!(pages.len(), 1);
        assert!(pages.contains(&"https://example.com/articles?page=2".to_string()));

        let html = r#"<a href="/articles?page=1" rel="next">Back</a>"#;
        let pages = detector.detect_pagination(html, "https://example.com/articles?page=2");
        assert!(
            !pages.contains(&"https://example.com/articles?page=1".to_string()),
            "Page 1 should be filtered out (circular/already visited)"
        );
    }

    #[test]
    fn test_last_page_detection() {
        let html = r#"<button class="next disabled">Next</button>"#;
        assert!(PaginationDetector::is_last_page(html));
        let html2 = r#"<a href="/page/2/" class="next" disabled>Next</a>"#;
        assert!(PaginationDetector::is_last_page(html2));
        let html3 = r#"<a aria-label="Next page" aria-disabled="true">Next</a>"#;
        assert!(PaginationDetector::is_last_page(html3));
    }

    #[test]
    fn test_depth_limit_enforced() {
        let mut detector = PaginationDetector::new(PaginationConfig {
            max_pages: 100,
            max_depth: 2,
            detect_circular: true,
        });
        let mut current_url = "https://example.com/page/1/".to_string();
        for iteration in 1..=5 {
            let html = format!(r#"<a href="/page/{}/" rel="next">Next</a>"#, iteration + 1);
            let pages = detector.detect_pagination(&html, &current_url);
            if iteration > 2 {
                assert_eq!(
                    pages.len(),
                    0,
                    "Expected no pages after depth limit at iteration {} (current_url={}, internal_depth={})",
                    iteration,
                    current_url,
                    detector.pagination_depth.get(&current_url).unwrap_or(&0)
                );
            } else {
                assert!(
                    !pages.is_empty(),
                    "Expected pages before depth limit at iteration {}",
                    iteration
                );
                current_url = pages[0].clone();
            }
        }
    }

    #[test]
    fn test_visited_deduplication() {
        let mut detector = PaginationDetector::new_default();
        let html = r#"<a href="/page/2/" rel="next">Next</a>"#;
        let pages1 = detector.detect_pagination(html, "https://example.com/page/1/");
        assert_eq!(pages1.len(), 1);

        let html = r#"<a href="/page/2/" rel="next">Next</a>"#;
        let pages2 = detector.detect_pagination(html, "https://example.com/page/1/");
        assert_eq!(pages2.len(), 0);
    }
}