Skip to main content

scrapling_spider/
session.rs

1//! Named HTTP session management for the crawler.
2//!
3//! The [`SessionManager`] lets a spider register multiple HTTP backends under
4//! string identifiers. Each [`Request`](crate::request::Request) carries an
5//! optional `sid` (session ID) that tells the engine which backend to use for
6//! that particular fetch. This is how you mix plain HTTP fetchers with
7//! cookie-persisting sessions, or route certain domains through different proxy
8//! configurations.
9//!
10//! A session is either a stateless [`Fetcher`] or a stateful [`FetcherSession`]
11//! (which automatically carries cookies and custom headers across requests).
12//! The spider configures its sessions in
13//! [`Spider::configure_sessions`](crate::spider::Spider::configure_sessions).
14//!
15//! ```rust,ignore
16//! use scrapling_spider::session::{Session, SessionManager};
17//! use scrapling_fetch::Fetcher;
18//!
19//! let mut manager = SessionManager::new();
20//! manager.add("default", Session::Fetcher(Fetcher::new()), true).unwrap();
21//! ```
22
23use std::collections::HashMap;
24
25use scrapling_fetch::{Fetcher, FetcherSession, Response};
26use tracing::debug;
27
28use crate::error::{Result, SpiderError};
29use crate::request::Request;
30
31/// A session backend that can be either a stateless fetcher or a stateful session.
32///
33/// The distinction matters for cookie handling: a [`Fetcher`] starts fresh on
34/// every request, while a [`FetcherSession`] preserves cookies, headers, and
35/// other state between requests. Choose the variant that matches your crawling
36/// needs.
37pub enum Session {
38    /// A stateless HTTP fetcher. Each request is independent -- no cookies or
39    /// headers carry over. This is the simplest option and works well for public
40    /// pages that do not require authentication.
41    Fetcher(Fetcher),
42    /// A stateful HTTP session that persists cookies, headers, and other
43    /// connection-level state across requests. Use this when you need to log in,
44    /// maintain a CSRF token, or interact with session-dependent APIs.
45    FetcherSession(FetcherSession),
46}
47
48/// Manages named sessions and dispatches fetch requests to the appropriate one.
49///
50/// The engine owns a single `SessionManager` and uses it for every fetch during
51/// the crawl. Sessions are identified by string IDs; one session is designated
52/// as the default and is used whenever a [`Request`](crate::request::Request)
53/// does not specify a `sid`.
54pub struct SessionManager {
55    sessions: HashMap<String, Session>,
56    default_session_id: Option<String>,
57}
58
59impl SessionManager {
60    /// Creates an empty session manager with no sessions registered. You must
61    /// call [`add`](SessionManager::add) at least once before the engine can
62    /// fetch anything; the engine will return an error if the manager is empty.
63    pub fn new() -> Self {
64        Self {
65            sessions: HashMap::new(),
66            default_session_id: None,
67        }
68    }
69
70    /// Registers a session under the given ID, optionally marking it as the
71    /// default. The first session added automatically becomes the default even
72    /// if `default` is `false`. Returns an error if a session with the same ID
73    /// already exists.
74    pub fn add(
75        &mut self,
76        session_id: impl Into<String>,
77        session: Session,
78        default: bool,
79    ) -> Result<()> {
80        let id = session_id.into();
81        if self.sessions.contains_key(&id) {
82            return Err(SpiderError::Session(format!(
83                "session '{id}' already exists"
84            )));
85        }
86        if default || self.default_session_id.is_none() {
87            self.default_session_id = Some(id.clone());
88        }
89        self.sessions.insert(id, session);
90        Ok(())
91    }
92
93    /// Returns the default session ID, or an error if none is set. The engine
94    /// calls this whenever a request has an empty `sid` field to determine which
95    /// session to use.
96    pub fn default_session_id(&self) -> Result<&str> {
97        self.default_session_id
98            .as_deref()
99            .ok_or_else(|| SpiderError::Session("no default session".into()))
100    }
101
102    /// Returns a list of all registered session IDs.
103    pub fn session_ids(&self) -> Vec<&str> {
104        self.sessions.keys().map(|s| s.as_str()).collect()
105    }
106
107    /// Returns a reference to the session with the given ID, or an error if not found.
108    pub fn get(&self, session_id: &str) -> Result<&Session> {
109        self.sessions.get(session_id).ok_or_else(|| {
110            SpiderError::Session(format!(
111                "session '{session_id}' not found; available: {:?}",
112                self.session_ids()
113            ))
114        })
115    }
116
117    /// Returns a mutable reference to the session with the given ID, or an error if not found.
118    pub fn get_mut(&mut self, session_id: &str) -> Result<&mut Session> {
119        let ids = self.session_ids().join(", ");
120        self.sessions.get_mut(session_id).ok_or_else(|| {
121            SpiderError::Session(format!(
122                "session '{session_id}' not found; available: [{ids}]"
123            ))
124        })
125    }
126
127    /// Fetches the URL from the request using the appropriate session.
128    ///
129    /// If the request has a non-empty `sid`, that session is used; otherwise
130    /// the default session is selected. The method dispatches to the correct
131    /// `get` implementation depending on whether the session is a [`Fetcher`]
132    /// or a [`FetcherSession`]. Returns the HTTP response on success, or a
133    /// [`SpiderError`](crate::error::SpiderError) on failure.
134    pub async fn fetch(&self, request: &Request) -> Result<Response> {
135        let sid = if request.sid.is_empty() {
136            self.default_session_id()?
137        } else {
138            &request.sid
139        };
140
141        let session = self.get(sid)?;
142
143        debug!(sid = sid, url = %request.url, "fetching via session");
144
145        let response = match session {
146            Session::Fetcher(f) => f.get(&request.url, None).await?,
147            Session::FetcherSession(fs) => fs.get(&request.url, None).await?,
148        };
149
150        Ok(response)
151    }
152
153    /// Returns the number of registered sessions.
154    pub fn len(&self) -> usize {
155        self.sessions.len()
156    }
157
158    /// Returns `true` if no sessions are registered.
159    pub fn is_empty(&self) -> bool {
160        self.sessions.is_empty()
161    }
162
163    /// Returns `true` if a session with the given ID exists.
164    pub fn contains(&self, session_id: &str) -> bool {
165        self.sessions.contains_key(session_id)
166    }
167}
168
169impl Default for SessionManager {
170    fn default() -> Self {
171        Self::new()
172    }
173}