// scrapling_browser/page_pool.rs

//! Thread-safe browser page pool for tracking concurrent page usage.
//!
//! When a session is configured with `max_pages > 1`, multiple pages can be open
//! simultaneously. The [`PagePool`] keeps track of how many pages exist, which ones
//! are busy, and enforces the capacity limit so the browser does not consume
//! unbounded resources.
//!
//! Each page is identified by a zero-based `page_index` and transitions through
//! three states defined by [`PageState`]:
//!
//! ```text
//!   Ready ──▶ Busy ──▶ Ready   (successful navigation)
//!                  └──▶ Error   (unrecoverable failure)
//! ```
//!
//! Pages in the `Error` state can be cleaned up with [`PagePool::cleanup_error_pages`].
//! The [`PoolStats`] snapshot is useful for monitoring and logging.

19use std::sync::Mutex;
20
/// The lifecycle stage a pooled browser page is currently in.
///
/// A page begins as `Ready`, becomes `Busy` for the duration of a navigation,
/// and afterwards either goes back to `Ready` (success) or ends up in `Error`
/// (unrecoverable failure).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PageState {
    /// Idle and available: the page can be handed a new navigation.
    /// This is also the state a page returns to after a successful fetch cycle.
    Ready,

    /// In use: a navigation or action is running on the page.
    /// A busy page should not be picked up by another fetch.
    Busy,

    /// Failed: the page hit an unrecoverable error and should be discarded.
    /// [`PagePool::cleanup_error_pages`] removes every page in this state.
    Error,
}
39
40/// Metadata tracked for each page in the pool.
41///
42/// Stores the page's index, current lifecycle state, and the URL it is navigating
43/// to (if any). The pool uses this information to enforce capacity limits and to
44/// report statistics via [`PoolStats`].
45#[derive(Debug)]
46pub struct PageInfo {
47    /// Zero-based index identifying this page within the pool.
48    pub page_index: usize,
49    /// Current lifecycle state of the page.
50    pub state: PageState,
51    /// URL the page is currently navigated to, or empty if idle.
52    pub url: String,
53}
54
55impl PageInfo {
56    /// Create a new `PageInfo` in the `Ready` state with the given index.
57    /// The URL is initially empty and will be set when the page starts navigating.
58    pub fn new(page_index: usize) -> Self {
59        Self {
60            page_index,
61            state: PageState::Ready,
62            url: String::new(),
63        }
64    }
65}
66
67/// Thread-safe pool that tracks browser page states and enforces a capacity limit.
68///
69/// The pool uses a `Mutex<Vec<PageInfo>>` internally, making it safe to share across
70/// async tasks. All mutation methods acquire the lock, perform the update, and release
71/// it immediately to minimise contention.
72pub struct PagePool {
73    /// Maximum number of pages allowed in the pool.
74    pub max_pages: u32,
75    pages: Mutex<Vec<PageInfo>>,
76}
77
78impl PagePool {
79    /// Create an empty page pool with the given capacity.
80    /// No pages are registered yet -- call [`add_page`](Self::add_page) to register
81    /// each new page as it is created by the session.
82    pub fn new(max_pages: u32) -> Self {
83        Self {
84            max_pages,
85            pages: Mutex::new(Vec::new()),
86        }
87    }
88
89    /// Register a new page in the pool, returning an error if the pool is full.
90    /// The page starts in the [`PageState::Ready`] state. Returns
91    /// [`BrowserError::PagePool`] when the number of registered pages already
92    /// equals `max_pages`.
93    pub fn add_page(&self, page_index: usize) -> crate::error::Result<()> {
94        let mut pages = self.pages.lock().unwrap();
95        if pages.len() >= self.max_pages as usize {
96            return Err(crate::error::BrowserError::PagePool(format!(
97                "page pool full ({}/{})",
98                pages.len(),
99                self.max_pages
100            )));
101        }
102        pages.push(PageInfo::new(page_index));
103        Ok(())
104    }
105
106    fn with_page(&self, page_index: usize, f: impl FnOnce(&mut PageInfo)) {
107        let mut pages = self.pages.lock().unwrap();
108        if let Some(info) = pages.iter_mut().find(|p| p.page_index == page_index) {
109            f(info);
110        }
111    }
112
113    /// Mark a page as busy and record the URL it is navigating to.
114    /// Call this at the start of a fetch cycle to signal that the page is in use.
115    pub fn mark_busy(&self, page_index: usize, url: &str) {
116        self.with_page(page_index, |info| {
117            info.state = PageState::Busy;
118            info.url = url.to_owned();
119        });
120    }
121
122    /// Mark a page as ready (idle) for reuse.
123    /// Call this after a fetch cycle completes successfully.
124    pub fn mark_ready(&self, page_index: usize) {
125        self.with_page(page_index, |info| info.state = PageState::Ready);
126    }
127
128    /// Mark a page as having encountered an error.
129    /// The page will not be reused until it is cleaned up via
130    /// [`cleanup_error_pages`](Self::cleanup_error_pages).
131    pub fn mark_error(&self, page_index: usize) {
132        self.with_page(page_index, |info| info.state = PageState::Error);
133    }
134
135    /// Return the total number of pages currently in the pool (all states combined).
136    pub fn pages_count(&self) -> usize {
137        self.pages.lock().unwrap().len()
138    }
139
140    /// Return the number of pages currently in the `Busy` state.
141    pub fn busy_count(&self) -> usize {
142        self.pages
143            .lock()
144            .unwrap()
145            .iter()
146            .filter(|p| p.state == PageState::Busy)
147            .count()
148    }
149
150    /// Remove all pages in the `Error` state from the pool.
151    /// This frees up capacity so new pages can be registered. You should call this
152    /// periodically or after a batch of fetches to reclaim slots.
153    pub fn cleanup_error_pages(&self) {
154        self.pages
155            .lock()
156            .unwrap()
157            .retain(|p| p.state != PageState::Error);
158    }
159
160    /// Take a snapshot of the pool's current statistics.
161    /// The snapshot is a point-in-time copy and does not hold the lock after returning.
162    pub fn stats(&self) -> PoolStats {
163        let pages = self.pages.lock().unwrap();
164        PoolStats {
165            total_pages: pages.len(),
166            busy_pages: pages.iter().filter(|p| p.state == PageState::Busy).count(),
167            max_pages: self.max_pages as usize,
168        }
169    }
170}
171
/// Point-in-time snapshot of page pool utilisation.
///
/// Returned by [`PagePool::stats`]. Use this for monitoring, logging, or deciding
/// whether to wait before starting a new fetch. Snapshots are plain values and
/// can be compared with `==`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PoolStats {
    /// Number of pages currently tracked in the pool (all states).
    ///
    /// This is the figure the capacity check in [`PagePool::add_page`] uses: once
    /// it reaches `max_pages`, no further page can be registered until
    /// [`PagePool::cleanup_error_pages`] removes errored pages.
    pub total_pages: usize,

    /// Number of pages currently in the `Busy` state.
    ///
    /// Counted from the same locked page list as `total_pages`, so it never
    /// exceeds it.
    pub busy_pages: usize,

    /// Maximum capacity of the pool: the value of [`PagePool::max_pages`] at the
    /// time of the snapshot (configured via `BrowserConfig::max_pages`).
    pub max_pages: usize,
}