scrapling_spider/session.rs
1//! Named HTTP session management for the crawler.
2//!
3//! The [`SessionManager`] lets a spider register multiple HTTP backends under
4//! string identifiers. Each [`Request`](crate::request::Request) carries an
5//! optional `sid` (session ID) that tells the engine which backend to use for
6//! that particular fetch. This is how you mix plain HTTP fetchers with
7//! cookie-persisting sessions, or route certain domains through different proxy
8//! configurations.
9//!
10//! A session is either a stateless [`Fetcher`] or a stateful [`FetcherSession`]
11//! (which automatically carries cookies and custom headers across requests).
12//! The spider configures its sessions in
13//! [`Spider::configure_sessions`](crate::spider::Spider::configure_sessions).
14//!
15//! ```rust,ignore
16//! use scrapling_spider::session::{Session, SessionManager};
17//! use scrapling_fetch::Fetcher;
18//!
19//! let mut manager = SessionManager::new();
20//! manager.add("default", Session::Fetcher(Fetcher::new()), true).unwrap();
21//! ```
22
23use std::collections::HashMap;
24
25use scrapling_fetch::{Fetcher, FetcherSession, Response};
26use tracing::debug;
27
28use crate::error::{Result, SpiderError};
29use crate::request::Request;
30
31/// A session backend that can be either a stateless fetcher or a stateful session.
32///
33/// The distinction matters for cookie handling: a [`Fetcher`] starts fresh on
34/// every request, while a [`FetcherSession`] preserves cookies, headers, and
35/// other state between requests. Choose the variant that matches your crawling
36/// needs.
37pub enum Session {
38 /// A stateless HTTP fetcher. Each request is independent -- no cookies or
39 /// headers carry over. This is the simplest option and works well for public
40 /// pages that do not require authentication.
41 Fetcher(Fetcher),
42 /// A stateful HTTP session that persists cookies, headers, and other
43 /// connection-level state across requests. Use this when you need to log in,
44 /// maintain a CSRF token, or interact with session-dependent APIs.
45 FetcherSession(FetcherSession),
46}
47
48/// Manages named sessions and dispatches fetch requests to the appropriate one.
49///
50/// The engine owns a single `SessionManager` and uses it for every fetch during
51/// the crawl. Sessions are identified by string IDs; one session is designated
52/// as the default and is used whenever a [`Request`](crate::request::Request)
53/// does not specify a `sid`.
54pub struct SessionManager {
55 sessions: HashMap<String, Session>,
56 default_session_id: Option<String>,
57}
58
59impl SessionManager {
60 /// Creates an empty session manager with no sessions registered. You must
61 /// call [`add`](SessionManager::add) at least once before the engine can
62 /// fetch anything; the engine will return an error if the manager is empty.
63 pub fn new() -> Self {
64 Self {
65 sessions: HashMap::new(),
66 default_session_id: None,
67 }
68 }
69
70 /// Registers a session under the given ID, optionally marking it as the
71 /// default. The first session added automatically becomes the default even
72 /// if `default` is `false`. Returns an error if a session with the same ID
73 /// already exists.
74 pub fn add(
75 &mut self,
76 session_id: impl Into<String>,
77 session: Session,
78 default: bool,
79 ) -> Result<()> {
80 let id = session_id.into();
81 if self.sessions.contains_key(&id) {
82 return Err(SpiderError::Session(format!(
83 "session '{id}' already exists"
84 )));
85 }
86 if default || self.default_session_id.is_none() {
87 self.default_session_id = Some(id.clone());
88 }
89 self.sessions.insert(id, session);
90 Ok(())
91 }
92
93 /// Returns the default session ID, or an error if none is set. The engine
94 /// calls this whenever a request has an empty `sid` field to determine which
95 /// session to use.
96 pub fn default_session_id(&self) -> Result<&str> {
97 self.default_session_id
98 .as_deref()
99 .ok_or_else(|| SpiderError::Session("no default session".into()))
100 }
101
102 /// Returns a list of all registered session IDs.
103 pub fn session_ids(&self) -> Vec<&str> {
104 self.sessions.keys().map(|s| s.as_str()).collect()
105 }
106
107 /// Returns a reference to the session with the given ID, or an error if not found.
108 pub fn get(&self, session_id: &str) -> Result<&Session> {
109 self.sessions.get(session_id).ok_or_else(|| {
110 SpiderError::Session(format!(
111 "session '{session_id}' not found; available: {:?}",
112 self.session_ids()
113 ))
114 })
115 }
116
117 /// Returns a mutable reference to the session with the given ID, or an error if not found.
118 pub fn get_mut(&mut self, session_id: &str) -> Result<&mut Session> {
119 let ids = self.session_ids().join(", ");
120 self.sessions.get_mut(session_id).ok_or_else(|| {
121 SpiderError::Session(format!(
122 "session '{session_id}' not found; available: [{ids}]"
123 ))
124 })
125 }
126
127 /// Fetches the URL from the request using the appropriate session.
128 ///
129 /// If the request has a non-empty `sid`, that session is used; otherwise
130 /// the default session is selected. The method dispatches to the correct
131 /// `get` implementation depending on whether the session is a [`Fetcher`]
132 /// or a [`FetcherSession`]. Returns the HTTP response on success, or a
133 /// [`SpiderError`](crate::error::SpiderError) on failure.
134 pub async fn fetch(&self, request: &Request) -> Result<Response> {
135 let sid = if request.sid.is_empty() {
136 self.default_session_id()?
137 } else {
138 &request.sid
139 };
140
141 let session = self.get(sid)?;
142
143 debug!(sid = sid, url = %request.url, "fetching via session");
144
145 let response = match session {
146 Session::Fetcher(f) => f.get(&request.url, None).await?,
147 Session::FetcherSession(fs) => fs.get(&request.url, None).await?,
148 };
149
150 Ok(response)
151 }
152
153 /// Returns the number of registered sessions.
154 pub fn len(&self) -> usize {
155 self.sessions.len()
156 }
157
158 /// Returns `true` if no sessions are registered.
159 pub fn is_empty(&self) -> bool {
160 self.sessions.is_empty()
161 }
162
163 /// Returns `true` if a session with the given ID exists.
164 pub fn contains(&self, session_id: &str) -> bool {
165 self.sessions.contains_key(session_id)
166 }
167}
168
169impl Default for SessionManager {
170 fn default() -> Self {
171 Self::new()
172 }
173}