scrapling_browser/page_pool.rs
1//! Thread-safe browser page pool for tracking concurrent page usage.
2//!
3//! When a session is configured with `max_pages > 1`, multiple pages can be open
4//! simultaneously. The [`PagePool`] keeps track of how many pages exist, which ones
5//! are busy, and enforces the capacity limit so the browser does not consume
6//! unbounded resources.
7//!
8//! Each page is identified by a zero-based `page_index` and transitions through
9//! three states defined by [`PageState`]:
10//!
11//! ```text
//! Ready ──▶ Busy ──┬──▶ Ready  (successful navigation)
//!                  └──▶ Error  (unrecoverable failure)
14//! ```
15//!
16//! Pages in the `Error` state can be cleaned up with [`PagePool::cleanup_error_pages`].
17//! The [`PoolStats`] snapshot is useful for monitoring and logging.
18
19use std::sync::Mutex;
20
/// The state a pooled browser page is in at a given moment.
///
/// A page begins as `Ready`, becomes `Busy` while it navigates, and afterwards
/// either goes back to `Ready` (success) or ends up in `Error` (unrecoverable
/// failure).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PageState {
    /// Idle and available for a new navigation.
    /// This is also the state a page returns to after a successful fetch cycle.
    Ready,

    /// In the middle of a navigation or action.
    /// A page in this state should not be handed to another fetch.
    Busy,

    /// Failed in a way that cannot be recovered; the page should be discarded.
    /// [`PagePool::cleanup_error_pages`] removes every page in this state.
    Error,
}
39
40/// Metadata tracked for each page in the pool.
41///
42/// Stores the page's index, current lifecycle state, and the URL it is navigating
43/// to (if any). The pool uses this information to enforce capacity limits and to
44/// report statistics via [`PoolStats`].
45#[derive(Debug)]
46pub struct PageInfo {
47 /// Zero-based index identifying this page within the pool.
48 pub page_index: usize,
49 /// Current lifecycle state of the page.
50 pub state: PageState,
51 /// URL the page is currently navigated to, or empty if idle.
52 pub url: String,
53}
54
55impl PageInfo {
56 /// Create a new `PageInfo` in the `Ready` state with the given index.
57 /// The URL is initially empty and will be set when the page starts navigating.
58 pub fn new(page_index: usize) -> Self {
59 Self {
60 page_index,
61 state: PageState::Ready,
62 url: String::new(),
63 }
64 }
65}
66
67/// Thread-safe pool that tracks browser page states and enforces a capacity limit.
68///
69/// The pool uses a `Mutex<Vec<PageInfo>>` internally, making it safe to share across
70/// async tasks. All mutation methods acquire the lock, perform the update, and release
71/// it immediately to minimise contention.
72pub struct PagePool {
73 /// Maximum number of pages allowed in the pool.
74 pub max_pages: u32,
75 pages: Mutex<Vec<PageInfo>>,
76}
77
78impl PagePool {
79 /// Create an empty page pool with the given capacity.
80 /// No pages are registered yet -- call [`add_page`](Self::add_page) to register
81 /// each new page as it is created by the session.
82 pub fn new(max_pages: u32) -> Self {
83 Self {
84 max_pages,
85 pages: Mutex::new(Vec::new()),
86 }
87 }
88
89 /// Register a new page in the pool, returning an error if the pool is full.
90 /// The page starts in the [`PageState::Ready`] state. Returns
91 /// [`BrowserError::PagePool`] when the number of registered pages already
92 /// equals `max_pages`.
93 pub fn add_page(&self, page_index: usize) -> crate::error::Result<()> {
94 let mut pages = self.pages.lock().unwrap();
95 if pages.len() >= self.max_pages as usize {
96 return Err(crate::error::BrowserError::PagePool(format!(
97 "page pool full ({}/{})",
98 pages.len(),
99 self.max_pages
100 )));
101 }
102 pages.push(PageInfo::new(page_index));
103 Ok(())
104 }
105
106 fn with_page(&self, page_index: usize, f: impl FnOnce(&mut PageInfo)) {
107 let mut pages = self.pages.lock().unwrap();
108 if let Some(info) = pages.iter_mut().find(|p| p.page_index == page_index) {
109 f(info);
110 }
111 }
112
113 /// Mark a page as busy and record the URL it is navigating to.
114 /// Call this at the start of a fetch cycle to signal that the page is in use.
115 pub fn mark_busy(&self, page_index: usize, url: &str) {
116 self.with_page(page_index, |info| {
117 info.state = PageState::Busy;
118 info.url = url.to_owned();
119 });
120 }
121
122 /// Mark a page as ready (idle) for reuse.
123 /// Call this after a fetch cycle completes successfully.
124 pub fn mark_ready(&self, page_index: usize) {
125 self.with_page(page_index, |info| info.state = PageState::Ready);
126 }
127
128 /// Mark a page as having encountered an error.
129 /// The page will not be reused until it is cleaned up via
130 /// [`cleanup_error_pages`](Self::cleanup_error_pages).
131 pub fn mark_error(&self, page_index: usize) {
132 self.with_page(page_index, |info| info.state = PageState::Error);
133 }
134
135 /// Return the total number of pages currently in the pool (all states combined).
136 pub fn pages_count(&self) -> usize {
137 self.pages.lock().unwrap().len()
138 }
139
140 /// Return the number of pages currently in the `Busy` state.
141 pub fn busy_count(&self) -> usize {
142 self.pages
143 .lock()
144 .unwrap()
145 .iter()
146 .filter(|p| p.state == PageState::Busy)
147 .count()
148 }
149
150 /// Remove all pages in the `Error` state from the pool.
151 /// This frees up capacity so new pages can be registered. You should call this
152 /// periodically or after a batch of fetches to reclaim slots.
153 pub fn cleanup_error_pages(&self) {
154 self.pages
155 .lock()
156 .unwrap()
157 .retain(|p| p.state != PageState::Error);
158 }
159
160 /// Take a snapshot of the pool's current statistics.
161 /// The snapshot is a point-in-time copy and does not hold the lock after returning.
162 pub fn stats(&self) -> PoolStats {
163 let pages = self.pages.lock().unwrap();
164 PoolStats {
165 total_pages: pages.len(),
166 busy_pages: pages.iter().filter(|p| p.state == PageState::Busy).count(),
167 max_pages: self.max_pages as usize,
168 }
169 }
170}
171
/// Point-in-time snapshot of page pool utilisation.
///
/// Returned by [`PagePool::stats`]. Use this for monitoring, logging, or deciding
/// whether to wait before starting a new fetch. Snapshots are plain values and
/// can be cloned and compared with `==`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PoolStats {
    /// Number of pages currently tracked in the pool (all states).
    /// Once this reaches `max_pages`, [`PagePool::add_page`] fails until a page
    /// is removed from the pool.
    pub total_pages: usize,

    /// Number of pages currently in the `Busy` state.
    /// The capacity check is made against `total_pages`, not against this count.
    pub busy_pages: usize,

    /// Maximum capacity of the pool, copied from [`PagePool::max_pages`]
    /// (documented upstream as configured through `BrowserConfig::max_pages`).
    pub max_pages: usize,
}