use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use serde_json::{json, Value};
use tokio::sync::mpsc;
use tracing::{info, warn};
use crate::config::BrowserConfig;
use crate::tools::command_risk::{PermissionMode, RiskLevel};
use crate::tools::terminal::ApprovalRequest;
use crate::tools::ApprovalBroker;
use crate::traits::{
MessageAttachment, Tool, ToolCallMetadata, ToolCallOutcome, ToolCallSemantics,
ToolCapabilities, ToolTargetHintKind, ToolVerificationMode,
};
use crate::types::{ApprovalResponse, MediaKind, MediaMessage};
use policy::BrowserRiskClass;
mod backend;
mod diagnostics;
pub mod policy;
mod session;
#[cfg(all(test, feature = "browser"))]
mod smoke;
#[cfg(test)]
mod tests;
use backend::{BrowserBackend, ChromiumoxideBackend, PageHandle};
use diagnostics::BrowserDiagnosticsStore;
use session::{BrowserSessionRegistry, TabView};
use tokio::sync::OwnedMutexGuard;
/// Default time the user has to respond to a browser approval prompt before the
/// action is auto-denied (fail safe): 300 seconds, i.e. five minutes. Matches
/// the terminal/config approval window. Overridable in tests so the timeout
/// path runs in milliseconds.
const DEFAULT_APPROVAL_TIMEOUT: Duration = Duration::from_secs(300);
/// Upper bound, in bytes, on the `script` argument accepted by `execute_js`.
/// 64 KiB is generous for legitimate automation while still bounding abuse via
/// enormous payloads.
const MAX_SCRIPT_BYTES: usize = 64 * 1024;
/// Substrings that mark a script as reaching for a privileged
/// browser-management API, i.e. one that sidesteps the session/tab model or the
/// approval boundary. A script containing any of them is rejected before it is
/// evaluated.
///
/// Why each entry is listed:
/// - `window.open` — opens tabs outside the BrowserTool session/tab model, so
///   the registry never sees them and the caller cannot account for them.
/// - `chrome.` — the whole chrome.* namespace (chrome.debugger,
///   chrome.management, chrome.tabs, chrome.runtime, ...) exposes privileged
///   extension/DevTools APIs that can detach the debugger, enumerate or modify
///   tabs across sessions, or exfiltrate data via cross-context messaging.
const JS_DENYLIST: &[&str] = &["window.open", "chrome."];
/// Check a script against the static constraints BEFORE the approval gate, so a
/// script that is bound to be rejected is never shown to the user and never
/// reaches the backend.
///
/// Returns `Ok(())` when every check passes. On failure returns `Err(reason)`
/// with a user-facing message that names only the violated constraint — the
/// script body itself is never echoed.
fn validate_script_constraints(script: &str) -> Result<(), String> {
    // Check 1: size cap.
    let size = script.len();
    if size > MAX_SCRIPT_BYTES {
        return Err(format!(
            "Script too large: {} bytes (max {}). Split the work into smaller steps.",
            size, MAX_SCRIPT_BYTES
        ));
    }
    // Check 2: denylist. Patterns are scanned in declaration order and the
    // first one found in the script is the one reported. Plain substring
    // matching is intentional: every pattern is specific enough not to collide
    // with ordinary single words.
    match JS_DENYLIST.iter().find(|banned| script.contains(**banned)) {
        Some(banned) => Err(format!(
            "Script uses a disallowed browser-management API ('{}' is not permitted). \
             Use BrowserTool tab actions (new_tab, switch_tab, close_tab) for tab management.",
            banned
        )),
        None => Ok(()),
    }
}
/// The condition the `wait` action polls for. `Present` is the default and
/// preserves the historical behavior (element exists in the DOM).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum WaitCondition {
    /// Element matching the selector exists in the DOM (default).
    Present,
    /// Element is present AND laid out / not hidden via CSS.
    Visible,
    /// Element's `disabled` property is falsy.
    Enabled,
    /// Element is absent OR laid out as hidden.
    Hidden,
    /// Element's text contains the provided needle.
    TextContains,
}
impl WaitCondition {
    /// Parse the wire name of a condition (`present`, `visible`, `enabled`,
    /// `hidden`, `text_contains`). Matching is exact and case-sensitive; any
    /// other input yields an error listing the valid names.
    fn parse(s: &str) -> Result<Self, String> {
        match s {
            "present" => Ok(Self::Present),
            "visible" => Ok(Self::Visible),
            "enabled" => Ok(Self::Enabled),
            "hidden" => Ok(Self::Hidden),
            "text_contains" => Ok(Self::TextContains),
            other => Err(format!(
                "Invalid wait condition '{}'. Valid: present, visible, enabled, hidden, text_contains",
                other
            )),
        }
    }
    /// Length of the needle in characters (Unicode scalar values), `0` when no
    /// needle was supplied. The messages label this figure "chars", so a byte
    /// length (`str::len`) would over-report any non-ASCII needle.
    fn needle_chars(needle: Option<&str>) -> usize {
        needle.map_or(0, |n| n.chars().count())
    }
    /// Message reported when the condition was satisfied. For `TextContains`
    /// only the needle's LENGTH is reported, never the needle text itself.
    fn success_message(self, selector: &str, needle: Option<&str>) -> String {
        match self {
            Self::Present => format!("Element '{}' found", selector),
            Self::Visible => format!("Element '{}' is visible", selector),
            Self::Enabled => format!("Element '{}' is enabled", selector),
            Self::Hidden => format!("Element '{}' is hidden", selector),
            Self::TextContains => format!(
                "Element '{}' text contains the expected value ({} chars)",
                selector,
                Self::needle_chars(needle)
            ),
        }
    }
    /// Message reported when the condition was not met within `secs` seconds.
    /// As with the success message, the needle text is never echoed.
    fn timeout_message(self, selector: &str, needle: Option<&str>, secs: u64) -> String {
        let what = match self {
            Self::Present => format!("element '{}' not found", selector),
            Self::Visible => format!("element '{}' not visible", selector),
            Self::Enabled => format!("element '{}' not enabled", selector),
            Self::Hidden => format!("element '{}' still visible", selector),
            Self::TextContains => format!(
                "element '{}' text did not contain the expected value ({} chars)",
                selector,
                Self::needle_chars(needle)
            ),
        };
        format!("Timeout: {} after {}s", what, secs)
    }
}
/// Decision returned by the approval gate. `Allow` lets the action reach the
/// backend; `Deny` blocks it with a user-facing reason BEFORE any page/backend
/// method is touched.
enum GateDecision {
/// The action is permitted and may proceed.
Allow,
/// The action is refused; the payload is the user-facing reason.
Deny(String),
}
/// The parsed, approval-relevant fields of a single browser tool call. Bundled
/// so the gate and prompt builder take one borrow instead of many positional
/// args. The `value` (fill text) is deliberately NOT carried here — it must
/// never reach the prompt.
struct ActionArgs<'a> {
/// Name of the requested action (e.g. `navigate`, `click`, `execute_js`).
action: &'a str,
/// URL argument, when the action carries one; only its redacted origin is
/// shown in the approval prompt.
url: Option<&'a str>,
/// Selector argument, when the action targets an element.
selector: Option<&'a str>,
/// Script argument for `execute_js`; the prompt shows only its byte length.
script: Option<&'a str>,
/// Tab id argument for tab-targeting actions.
tab_id: Option<&'a str>,
/// Session the call belongs to; used for the session-approval lookup.
session_id: &'a str,
}
/// Reduce a full URL to its origin (`scheme://host[:port]`), dropping any
/// userinfo, path, query, and fragment — all of which can carry secrets
/// (credentials, session tokens, reset codes) and must never be surfaced in a
/// tab listing or approval prompt.
///
/// Parsing is deliberately dependency-free string surgery:
/// - The input is split at its FIRST `/`, `?`, or `#`. It is treated as a
///   `scheme://authority` URL only when that first delimiter is the `//` that
///   directly follows `scheme:`. A `://` that appears later (inside a path or
///   query, e.g. `host.com/reset?next=https://x`) is NOT a scheme separator;
///   accepting it would echo the whole path/query back as the "scheme".
/// - For a `scheme://...` URL, the authority is everything up to the next `/`,
///   `?`, or `#`; any `userinfo@` prefix is dropped (cut after the LAST `@`),
///   keeping only `host[:port]`. So `https://user:pass@host/p?x=secret` →
///   `https://host`.
/// - Anything else (`about:blank`, `data:`, `mailto:`, or a schemeless
///   `host.com/path?x=secret`) is cut at the first `/`, `?`, or `#`, so no
///   path/query/fragment can leak.
///
/// Leading/trailing whitespace is trimmed first; an empty input yields an empty
/// string.
pub(super) fn redact_origin(url: &str) -> String {
    let url = url.trim();
    // Split at the first character that can terminate a scheme or authority.
    let first_delim = url.find(['/', '?', '#']).unwrap_or(url.len());
    let head = &url[..first_delim];
    let tail = &url[first_delim..];
    // `scheme://authority` form: `head` is `scheme:` and `tail` begins with `//`.
    if let (Some(scheme), Some(after_slashes)) = (head.strip_suffix(':'), tail.strip_prefix("//")) {
        // Authority ends at the next '/', '?', or '#'.
        let authority_len = after_slashes
            .find(['/', '?', '#'])
            .unwrap_or(after_slashes.len());
        let authority = &after_slashes[..authority_len];
        // Drop any `userinfo@` prefix so embedded credentials never survive:
        // keep only the host[:port] after the LAST '@'.
        let host = authority
            .rfind('@')
            .map_or(authority, |at| &authority[at + 1..]);
        return format!("{}://{}", scheme, host);
    }
    // No scheme://authority form: everything before the first delimiter is all
    // that may be shown.
    head.to_string()
}
/// Redact a URL for DISPLAY in a screenshot caption / action result: keep the
/// scheme, host, and PATH (useful context for the user) but strip
/// - the query string and fragment (anything from the first `?` or `#` onward),
///   which can carry session tokens, auth codes, or other secrets, and
/// - any `userinfo@` credentials embedded in the authority of a
///   `scheme://authority` URL.
///
/// An `@` inside the PATH is left alone — only the authority (the text between
/// `://` and the next `/`) is searched for userinfo.
///
/// Examples:
/// - `https://host.com/a/b?token=SECRET#frag` → `https://host.com/a/b`
/// - `https://user:pass@host.com/a` → `https://host.com/a`
/// - `https://host.com/@user/post` → `https://host.com/@user/post`
/// - `about:blank` → `about:blank`
fn redact_url_for_display(url: &str) -> String {
    let url = url.trim();
    let cut = url.find(['?', '#']).unwrap_or(url.len());
    let visible = &url[..cut];
    // Only a `://` that precedes every path slash is a scheme separator.
    let authority_start = match visible.find("://") {
        Some(sep) if !visible[..sep].contains('/') => sep + 3,
        _ => return visible.to_string(),
    };
    let authority_end = visible[authority_start..]
        .find('/')
        .map_or(visible.len(), |i| authority_start + i);
    // Keep only what follows the LAST '@' of the authority (host[:port]).
    match visible[authority_start..authority_end].rfind('@') {
        Some(at) => format!(
            "{}{}",
            &visible[..authority_start],
            &visible[authority_start + at + 1..]
        ),
        None => visible.to_string(),
    }
}
/// Render the one-line, user-facing description of a browser action shown in an
/// approval prompt.
///
/// The text is built only from the action name, the (already redacted)
/// `origin`, the optional `selector`, the optional `tab_id` (shown as `?` when
/// absent), and the script's byte length (`0` when absent). Fill values and
/// script bodies are never part of it. `_risk` is accepted but not used.
/// Unknown actions fall back to a generic description.
fn format_browser_approval_prompt(
    action: &str,
    origin: &str,
    selector: Option<&str>,
    tab_id: Option<&str>,
    script_len: Option<usize>,
    _risk: &policy::BrowserActionRisk,
) -> String {
    let tab = tab_id.unwrap_or("?");
    match (action, selector) {
        // Tab / lifecycle actions.
        ("list_tabs", _) => String::from("List open browser tabs"),
        ("close", _) => String::from("Close browser"),
        ("close_tab", _) => format!("Close browser tab {}", tab),
        ("switch_tab", _) => format!("Switch to browser tab {}", tab),
        // Navigation.
        ("navigate", _) => format!("Open website: {}", origin),
        ("new_tab", _) => format!("Open new tab: {}", origin),
        // Script execution: length only, never the body.
        ("execute_js", _) => {
            format!("Run JavaScript on {} ({} bytes)", origin, script_len.unwrap_or(0))
        }
        // Element interactions, with and without a selector.
        ("click", Some(sel)) => format!("Click \"{}\" on {}", sel, origin),
        ("click", None) => format!("Click on {}", origin),
        ("fill", Some(sel)) => format!("Fill in \"{}\" on {}", sel, origin),
        ("fill", None) => format!("Fill in a form field on {}", origin),
        // Observation / mode.
        ("get_text", _) => format!("Read page text from {}", origin),
        ("screenshot", _) => format!("Take screenshot of {}", origin),
        ("wait", _) => format!("Wait on {}", origin),
        ("set_mode", _) => format!("Change browser mode on {}", origin),
        _ => format!("Browser action on {}", origin),
    }
}
/// Telegram-safe upper bounds for an outbound screenshot, enforced BEFORE the
/// image is enqueued onto the media channel. A VIEWPORT capture at the default
/// window size is always within these; a `full_page` capture of a tall page can
/// exceed them, in which case we return a clear error instead of enqueuing an
/// image that the channel would silently reject (the production bug:
/// `PHOTO_INVALID_DIMENSIONS`, dropped with only a `warn!`).
///
/// Telegram rejects `sendPhoto` when width+height > 10000, when the aspect ratio
/// exceeds ~20:1, or when the file is larger than 10MB. We stay safely under each.
///
/// Maximum allowed `width + height`, in pixels.
const MAX_SCREENSHOT_DIM_SUM: u32 = 9000;
/// Maximum allowed ratio of the longer side to the shorter side.
const MAX_SCREENSHOT_RATIO: u32 = 18;
/// Maximum allowed encoded size, in bytes, for inline photo delivery.
const MAX_SCREENSHOT_BYTES: usize = 9_000_000;
/// Largest encoded screenshot, in bytes, that may be delivered as a DOCUMENT.
/// Telegram's `sendDocument` accepts large PNGs (~50MB, no pixel-dimension
/// limit) where `sendPhoto` rejects them; this stays safely under the 50MB bot
/// limit. A capture over the photo caps but within this bound is sent as a
/// file; anything larger is refused.
const MAX_SCREENSHOT_DOCUMENT_BYTES: usize = 49 * 1024 * 1024;
/// Delivery mode chosen for a screenshot, derived purely from its encoded byte
/// length and whether it fits the photo caps. Kept as a pure decision so every
/// branch can be unit-tested without allocating a real 49MB buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScreenshotDelivery {
    /// Fits the photo caps → sent inline as a `MediaKind::Photo`.
    Photo,
    /// Exceeds the photo caps but fits `MAX_SCREENSHOT_DOCUMENT_BYTES` → sent
    /// as a `MediaKind::Document` (file attachment).
    Document,
    /// Exceeds `MAX_SCREENSHOT_DOCUMENT_BYTES` → cannot be delivered.
    TooLarge,
}
/// Pick the delivery mode for a screenshot.
///
/// `oversize_reason` mirrors [`screenshot_oversize_reason`]'s output: `None`
/// means the capture is within the photo caps, `Some(_)` means it is over them
/// (the reason text itself is not inspected). `byte_len` only matters in the
/// over-cap case, where it decides between a document and a refusal.
fn screenshot_delivery_kind(
    byte_len: usize,
    oversize_reason: Option<String>,
) -> ScreenshotDelivery {
    if oversize_reason.is_none() {
        return ScreenshotDelivery::Photo;
    }
    if byte_len > MAX_SCREENSHOT_DOCUMENT_BYTES {
        ScreenshotDelivery::TooLarge
    } else {
        ScreenshotDelivery::Document
    }
}
/// Read a PNG's pixel dimensions straight from its header, without decoding the
/// image (and so without an `image` crate dependency).
///
/// Only the first 24 bytes are consulted: the 8-byte PNG signature, then the
/// big-endian `u32` width at `[16..20]` and height at `[20..24]`. Returns
/// `None` when fewer than 24 bytes are available or the signature does not
/// match; otherwise `Some((width, height))`.
fn png_dimensions(bytes: &[u8]) -> Option<(u32, u32)> {
    const PNG_SIGNATURE: [u8; 8] = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a];
    // A too-short buffer bails out here.
    let header = bytes.get(..24)?;
    if !header.starts_with(&PNG_SIGNATURE) {
        return None;
    }
    // Big-endian u32 starting at `at` within the 24-byte header.
    let be_u32 = |at: usize| {
        u32::from_be_bytes([header[at], header[at + 1], header[at + 2], header[at + 3]])
    };
    Some((be_u32(16), be_u32(20)))
}
/// Check a captured screenshot against the channel-safe photo caps.
///
/// Returns `None` when the capture is within every cap, or `Some(reason)` — a
/// short human-readable description of the first cap exceeded — otherwise.
/// Checks run in this order:
/// 1. encoded byte length against `MAX_SCREENSHOT_BYTES`;
/// 2. `width + height` (saturating) against `MAX_SCREENSHOT_DIM_SUM`;
/// 3. longer side divided by shorter side (integer division, skipped when the
///    shorter side is 0) against `MAX_SCREENSHOT_RATIO`.
///
/// Checks 2 and 3 need the PNG header; a buffer whose header cannot be parsed
/// is therefore judged on byte length alone.
fn screenshot_oversize_reason(bytes: &[u8]) -> Option<String> {
    let encoded_len = bytes.len();
    if encoded_len > MAX_SCREENSHOT_BYTES {
        return Some(format!(
            "encoded size {} bytes exceeds the {} byte limit",
            encoded_len, MAX_SCREENSHOT_BYTES
        ));
    }
    // Unknown dimensions → nothing further to check.
    let (w, h) = png_dimensions(bytes)?;
    if w.saturating_add(h) > MAX_SCREENSHOT_DIM_SUM {
        return Some(format!("dimensions {}x{}", w, h));
    }
    let shorter = w.min(h);
    let longer = w.max(h);
    if shorter > 0 && longer / shorter > MAX_SCREENSHOT_RATIO {
        return Some(format!("aspect ratio of {}x{}", w, h));
    }
    None
}
/// Outcome of dispatching a browser action: the text handed back to the LLM
/// plus any vision attachments that accompany it.
struct DispatchResult {
    text: String,
    attachments: Vec<MessageAttachment>,
}
impl DispatchResult {
    /// Wrap a plain text result that carries no attachments.
    fn text_only(text: String) -> Self {
        let attachments = Vec::new();
        Self { text, attachments }
    }
}
/// Browser automation tool: owns the browser backend handle, the per-session
/// page registry, the approval transport, the resolved timeouts, and the
/// per-session diagnostics store used by the individual browser actions.
pub struct BrowserTool {
/// Browser backend used to launch the browser and create/resolve pages.
backend: Arc<dyn BrowserBackend>,
/// Outbound channel on which captured screenshots are sent for delivery.
media_tx: mpsc::Sender<MediaMessage>,
/// Shared inbox for persisting screenshots for agent vision context.
inbox_dir: PathBuf,
/// Per-session page state, keyed by trusted internal `_session_id`.
sessions: BrowserSessionRegistry,
/// Transport used to prompt the user Allow/Deny at action time.
///
/// `None` means no approval channel is wired (only possible in tests via
/// `with_backend`); in that case every action that would require approval
/// fails safe to Deny without touching the backend.
approval_tx: Option<ApprovalBroker>,
/// How long to wait for an approval response before auto-denying.
approval_timeout: Duration,
/// Bounded navigation/DOM-ready timeout (resolved + clamped from config).
/// Used by `action_navigate` after `goto` and by the `action_click`
/// nav-race when a click triggers a navigation.
nav_timeout: Duration,
/// Default `wait`-action element-poll timeout (resolved + clamped). A
/// per-call `timeout_secs` arg overrides this (also clamped to the same
/// bound).
element_timeout: Duration,
/// Overall per-action ceiling for the click nav-race. The short stable-DOM
/// fallback for a NON-navigating click is a small fraction of this so a
/// click that doesn't navigate returns fast.
action_timeout: Duration,
/// Console logs and network load failures, scoped per session/tab.
diagnostics: BrowserDiagnosticsStore,
}
/// Upper bound (120 seconds) on the element-poll timeout, mirroring
/// `BrowserConfig`'s `element_timeout` clamp, applied to a per-call
/// `timeout_secs` override.
const MAX_ELEMENT_TIMEOUT_SECS: u64 = 120;
/// Interval (250 ms) between element-state polls in the `wait` action. With a
/// paused clock, tests advance by this step to drive the loop deterministically.
const WAIT_POLL_INTERVAL: Duration = Duration::from_millis(250);
/// Short stable-DOM settle (300 ms) used by the click nav-race for a
/// NON-navigating click: long enough for click-side JS to run, short enough
/// that a plain click returns fast. Bounded well under `action_timeout`.
const CLICK_SETTLE: Duration = Duration::from_millis(300);
impl BrowserTool {
/// Construct the browser tool, resolving and validating the session
/// isolation mode up front.
///
/// Returns an `Err` (surfaced at startup) when the configuration would
/// falsely claim per-session cookie isolation — e.g. `browser_context`
/// mode requested alongside a shared persistent profile or remote-debugging
/// Chrome. On success, logs the resolved mode and whether sessions SHARE
/// cookies, without logging any profile path contents.
pub fn new(
config: BrowserConfig,
media_tx: mpsc::Sender<MediaMessage>,
approval_tx: ApprovalBroker,
inbox_dir: impl Into<PathBuf>,
) -> Result<Self, String> {
// Resolve + clamp the bounded timeouts BEFORE `config` is moved into the
// backend.
let nav_timeout = config.nav_timeout();
let element_timeout = config.element_timeout();
let action_timeout = config.action_timeout();
let backend = ChromiumoxideBackend::new(config)?;
let mode = backend.session_isolation();
let (mode_label, shares_cookies) = match mode {
crate::config::SessionIsolation::Page => ("page", true),
crate::config::SessionIsolation::BrowserContext => ("browser_context", false),
};
info!(
isolation = mode_label,
shares_cookies,
"browser sessions share cookies: {shares_cookies} (isolation={mode_label})"
);
Ok(Self {
backend: Arc::new(backend),
media_tx,
inbox_dir: inbox_dir.into(),
sessions: BrowserSessionRegistry::new(),
approval_tx: Some(approval_tx),
approval_timeout: DEFAULT_APPROVAL_TIMEOUT,
nav_timeout,
element_timeout,
action_timeout,
diagnostics: BrowserDiagnosticsStore::new(),
})
}
/// Test-only constructor: wraps an injected backend (e.g. the mock) and wires
/// NO approval channel, so actions that need approval take the
/// missing-channel (fail-safe Deny) path. Uses a fixed temp-dir inbox, the
/// default approval timeout, and 30s/10s/30s nav/element/action timeouts.
#[cfg(test)]
pub fn with_backend(
    backend: Arc<dyn BrowserBackend>,
    media_tx: mpsc::Sender<MediaMessage>,
) -> Self {
    let inbox_dir = std::env::temp_dir().join("aidaemon-browser-test-inbox");
    let thirty_secs = Duration::from_secs(30);
    Self {
        backend,
        media_tx,
        inbox_dir,
        sessions: BrowserSessionRegistry::new(),
        approval_tx: None,
        approval_timeout: DEFAULT_APPROVAL_TIMEOUT,
        nav_timeout: thirty_secs,
        element_timeout: Duration::from_secs(10),
        action_timeout: thirty_secs,
        diagnostics: BrowserDiagnosticsStore::new(),
    }
}
/// Test-only constructor: wraps an injected backend together with an approval
/// broker and a caller-chosen (typically short) approval timeout, so the
/// allow/deny/timeout approval paths can be exercised in milliseconds. Other
/// settings match the plain test constructor: fixed temp-dir inbox and
/// 30s/10s/30s nav/element/action timeouts.
#[cfg(test)]
pub fn with_backend_and_approval(
    backend: Arc<dyn BrowserBackend>,
    media_tx: mpsc::Sender<MediaMessage>,
    approval_tx: ApprovalBroker,
    approval_timeout: Duration,
) -> Self {
    let inbox_dir = std::env::temp_dir().join("aidaemon-browser-test-inbox");
    let thirty_secs = Duration::from_secs(30);
    Self {
        backend,
        media_tx,
        inbox_dir,
        sessions: BrowserSessionRegistry::new(),
        approval_tx: Some(approval_tx),
        approval_timeout,
        nav_timeout: thirty_secs,
        element_timeout: Duration::from_secs(10),
        action_timeout: thirty_secs,
        diagnostics: BrowserDiagnosticsStore::new(),
    }
}
/// Test-only builder step: replace the resolved navigation, element, and
/// action timeouts and hand the tool back, so bounded-wait paths can run under
/// a paused fake clock independent of the production defaults.
#[cfg(test)]
pub fn with_timeouts(mut self, nav: Duration, element: Duration, action: Duration) -> Self {
    // The three assignments are independent of one another.
    self.action_timeout = action;
    self.element_timeout = element;
    self.nav_timeout = nav;
    self
}
/// Read the active tab's last-known URL (cached in the session registry)
/// WITHOUT touching the backend, so the approval prompt can show a redacted
/// origin before any page method runs. Returns `None` if the session has no
/// active tab yet or its URL is unknown.
async fn active_origin_for_prompt(&self, session_id: &str) -> Option<String> {
let tabs = self.sessions.list_tabs(session_id).await;
let active = tabs.iter().find(|t| t.active).or_else(|| tabs.first())?;
active
.url
.as_deref()
.map(redact_origin)
.filter(|o| !o.is_empty())
}
/// Assemble the secret-safe approval prompt text for an action.
///
/// The prompt NEVER contains the `fill` value, the `execute_js` script body
/// (only its byte length is passed on), or a full URL with path/query/fragment
/// (origins go through [`redact_origin`]).
///
/// Origin selection: when the action carries a `url` argument, its redacted
/// origin is used, or "current page" if that comes out empty. Without a `url`
/// argument, the active tab's last-known origin is used, or "current page"
/// when it is unknown.
async fn build_prompt(
    &self,
    args: &ActionArgs<'_>,
    risk: &policy::BrowserActionRisk,
) -> String {
    let known_origin: Option<String> = if let Some(raw_url) = args.url {
        // A url-bearing action never falls back to the active tab.
        Some(redact_origin(raw_url)).filter(|o| !o.is_empty())
    } else {
        self.active_origin_for_prompt(args.session_id).await
    };
    let origin = known_origin.unwrap_or_else(|| "current page".to_string());
    let script_len = args.script.map(str::len);
    format_browser_approval_prompt(
        args.action,
        &origin,
        args.selector,
        args.tab_id,
        script_len,
        risk,
    )
}
/// Submit an approval request to the broker and wait for the user's answer.
///
/// Returns `None` only when no approval channel is wired (callers treat that
/// as a fail-safe Deny). Otherwise always returns `Some(_)`: the user's
/// response, or `Deny` if the request could not be sent, the response channel
/// was dropped, or no answer arrived within `approval_timeout`. Each of those
/// fail-safe cases is logged with `warn!`.
async fn request_approval(
    &self,
    command: String,
    risk_level: RiskLevel,
    warnings: Vec<String>,
    session_id: &str,
) -> Option<ApprovalResponse> {
    let broker = self.approval_tx.as_ref()?;
    let (response_tx, response_rx) = tokio::sync::oneshot::channel();
    let request = ApprovalRequest {
        command,
        session_id: session_id.to_string(),
        risk_level,
        warnings,
        permission_mode: PermissionMode::Default,
        response_tx,
        kind: Default::default(),
    };
    if broker.send(request).await.is_err() {
        warn!("browser approval channel closed; denying action");
        return Some(ApprovalResponse::Deny);
    }
    let awaited = tokio::time::timeout(self.approval_timeout, response_rx).await;
    let response = match awaited {
        Ok(Ok(resp)) => resp,
        // The responder was dropped without answering.
        Ok(Err(_)) => {
            warn!("browser approval response channel closed; denying action");
            ApprovalResponse::Deny
        }
        // The approval window elapsed.
        Err(_) => {
            warn!("browser approval request timed out; denying action");
            ApprovalResponse::Deny
        }
    };
    Some(response)
}
/// The approval gate. It runs BEFORE any backend/page method is touched, so a
/// denied action can never reach the browser. [`GateDecision::Allow`] is
/// returned only when the action is permitted.
///
/// Rules, by risk class:
/// - `Observation` (get_text/screenshot/wait/list_tabs): never prompts.
/// - `Administrative` (close/close_tab/set_mode): never prompts — a local
///   lifecycle / mode switch, not a consequential web side effect.
/// - `sensitive || consequential` (every `execute_js`, plus consequential
///   click/fill): point-of-action — ALWAYS prompts, on every call, regardless
///   of prior session approval. A non-Deny answer allows ONLY this single
///   action and NEVER records persistent/session approval.
/// - `Navigation` / ordinary `Mutation`: session-level. Allowed without a
///   prompt once the session is approved; otherwise prompts. `AllowOnce` allows
///   just this action; `AllowSession`/`AllowAlways` additionally mark the
///   session approved so later ordinary actions don't re-prompt.
/// - No approval channel + an action that needs approval → fail safe to Deny
///   (observation/administrative actions still run).
async fn approval_gate(&self, args: &ActionArgs<'_>) -> GateDecision {
    // Shared by both "no channel" exits below.
    const NO_CHANNEL_DENIAL: &str =
        "Approval required, but no approval channel is available. Action denied.";

    let risk = policy::classify(args.action, args.selector, args.script);

    // Free actions: never prompt.
    let is_free = matches!(
        risk.class,
        BrowserRiskClass::Observation | BrowserRiskClass::Administrative
    );
    if is_free {
        return GateDecision::Allow;
    }

    let point_of_action = risk.sensitive || risk.consequential;

    // Session-level fast path — never taken for point-of-action.
    if !point_of_action && self.sessions.is_session_approved(args.session_id).await {
        return GateDecision::Allow;
    }

    // A prompt is required from here on; without a channel, fail safe.
    if self.approval_tx.is_none() {
        warn!(
            action = args.action,
            "browser action requires approval but no approval channel is wired; denying"
        );
        return GateDecision::Deny(NO_CHANNEL_DENIAL.to_string());
    }

    let risk_level = match point_of_action {
        true => RiskLevel::High,
        false => RiskLevel::Medium,
    };
    let mut warnings = Vec::with_capacity(2);
    if risk.sensitive {
        warnings.push("This can read or access private data on the page.".to_string());
    }
    if risk.consequential {
        warnings.push(
            "This may submit forms, make purchases, delete data, or send messages.".to_string(),
        );
    }

    let prompt = self.build_prompt(args, &risk).await;
    let answer = self
        .request_approval(prompt, risk_level, warnings, args.session_id)
        .await;
    match answer {
        // Already handled above; kept as a fail-safe.
        None => GateDecision::Deny(NO_CHANNEL_DENIAL.to_string()),
        Some(ApprovalResponse::Deny) => GateDecision::Deny("Denied by user.".to_string()),
        Some(ApprovalResponse::AllowOnce) => GateDecision::Allow,
        Some(ApprovalResponse::AllowSession | ApprovalResponse::AllowAlways) => {
            // Point-of-action approvals NEVER persist; only ordinary
            // navigation/mutation marks the session approved.
            if !point_of_action {
                self.sessions.mark_session_approved(args.session_id).await;
            }
            GateDecision::Allow
        }
    }
}
/// Resolve this session's page and acquire its action lock, held for the
/// WHOLE action via the returned owned guard.
///
/// The flow is: `ensure_ready()` (global browser launch) → resolve/create
/// the session's page via the registry → attach diagnostics listeners for the
/// active tab (when the session has one) → take the per-session action lock.
/// The action lock serializes a single session's own calls while letting
/// DIFFERENT sessions proceed concurrently — it is NOT the global browser
/// mutex, so distinct sessions do not serialize on each other.
///
/// Returns the page handle together with the owned lock guard; dropping the
/// guard releases the session's action lock. Errors from the empty-id check,
/// `ensure_ready`, or page resolution are returned as strings.
async fn page_for(
&self,
session_id: &str,
) -> Result<(Arc<dyn PageHandle>, OwnedMutexGuard<()>), String> {
// Reject empty session id BEFORE launching the browser.
if session_id.is_empty() {
return Err("browser actions require a session id".to_string());
}
self.backend.ensure_ready().await?;
let (page, action_lock) = self
.sessions
.get_or_create_page(session_id, &*self.backend)
.await?;
// Listener setup runs before the action lock is taken below, and only when
// the registry reports an active target id for this session.
if let Some(tab_id) = self.sessions.active_target_id(session_id).await {
self.diagnostics
.ensure_listeners(&page, session_id, &tab_id)
.await;
}
// Waits here until this session's lock is free.
let guard = action_lock.lock_owned().await;
Ok((page, guard))
}
/// Defense-in-depth gate for observation/JS actions: read the page's LIVE
/// committed URL and re-validate it against the shared private-network policy
/// BEFORE reading/capturing/evaluating any page content.
///
/// Per-request subresource interception is deferred (see the `PageHandle::url`
/// doc + the `#[ignore]`d feasibility stub), so a page can still reach a
/// blocked host AFTER load via a meta-refresh, JS-driven `location` change, or
/// nested frame. The final-URL revalidation in `action_navigate`/`action_new_tab`
/// only catches redirects at navigation time — it cannot catch a post-load
/// redirect. This helper closes that gap for the exfiltration vectors named in
/// the finding: by re-checking the live URL right before each observation/JS
/// action, a post-load redirect to a private host cannot be read out,
/// screenshotted, or evaluated.
///
/// Returns `Ok(())` when the live URL passes the policy, and also when the
/// page reports no URL at all (there is nothing to validate). On block,
/// returns a structured host-CLASS error only — never the URL, path, query, or
/// any embedded credentials — and logs the class with `warn!`.
async fn ensure_current_url_allowed(&self, page: &Arc<dyn PageHandle>) -> Result<(), String> {
    if let Some(current_url) = page.url().await {
        if let Err(blocked) = policy::validate_network_url(&current_url) {
            warn!(
                class = blocked.class.label(),
                "observation/JS action refused: current page is on a blocked host"
            );
            return Err(format!(
                "Action blocked: current page is a {}",
                blocked.class.label()
            ));
        }
    }
    Ok(())
}
async fn action_navigate(&self, args: &Value, session_id: &str) -> Result<String, String> {
let url = args
.get("url")
.and_then(|v| v.as_str())
.ok_or_else(|| "Missing required parameter: url".to_string())?;
// Pre-flight SSRF check on the requested URL. The error names ONLY the
// host class (loopback/private/link-local) — never the URL, path,
// query, or credentials — via the shared policy seam.
if let Err(blocked) = policy::validate_network_url(url) {
return Err(blocked.message());
}
let (page, _guard) = self.page_for(session_id).await?;
page.goto(url).await?;
// Wait for the navigation lifecycle / DOM-ready signal instead of a
// blind fixed sleep, bounded by `nav_timeout`. A page that never fires
// `load` does NOT hard-fail navigate — `wait_for_navigation` returns
// best-effort on its internal timeout. A genuine connection-class error
// here propagates so `dispatch_with_recovery` can classify it.
if let Err(e) = page.wait_for_navigation(self.nav_timeout).await {
// If the wait itself failed for a connection reason, the recovery
// wrapper handles it; surface it. The local timeout never reaches
// here (it returns Ok). See the connection-reset note below.
if backend::is_connection_error(&e) {
return Err(e);
}
warn!(error = %e, "navigation wait reported a non-connection error; proceeding to URL revalidation");
}
// Revalidate the FINAL committed URL. A server-side redirect can land on
// a blocked host (e.g. a public redirector → http://127.0.0.1/...) even
// though the requested URL was public and per-request subresource
// interception is deferred (see the Task 8 report / CDP feasibility
// note). If the committed URL is blocked, treat the navigation as
// blocked and surface ONLY the host class — never the committed URL,
// which may carry a path/query/token.
if let Some(final_url) = page.url().await {
if let Err(blocked) = policy::validate_network_url(&final_url) {
warn!(
class = blocked.class.label(),
"navigation landed on a blocked host after redirect; blocking"
);
// Neutralize the committed state: the page is currently sitting on
// the blocked host, so a subsequent get_text/screenshot/execute_js
// could read/capture/evaluate the blocked content even though we're
// about to return an error. Reset to about:blank so nothing on the
// blocked host remains observable. Best-effort: a failure here does
// not change the outcome (the action is still blocked).
let _ = page.goto("about:blank").await;
return Err(format!(
"Navigation blocked: redirected to a {}",
blocked.class.label()
));
}
}
Ok(format!("Navigated to {}", url))
}
/// `screenshot`: capture the session's active page as a PNG, save it for
/// vision context, and deliver it to the chat over the media channel.
///
/// Arguments read from `args`:
/// - `selector` (optional string): forwarded to the backend to capture a
///   single element.
/// - `full_page` (optional bool, default `false`): forwarded to the backend
///   to request a full-page capture instead of the viewport.
///
/// On confirmed delivery, returns the success text (including the saved
/// path) together with the saved attachment.
///
/// # Errors
/// Returns `Err` when the page cannot be resolved, the live URL check fails,
/// the capture or the save fails, the capture is classified as too large even
/// for file delivery, or delivery is rejected / cannot be confirmed within
/// 30 seconds.
async fn action_screenshot(
    &self,
    args: &Value,
    session_id: &str,
) -> Result<DispatchResult, String> {
    let (page, _guard) = self.page_for(session_id).await?;
    // Defense-in-depth: refuse to capture if the live committed URL is a
    // blocked host (e.g. reached via post-load JS-redirect/meta-refresh).
    self.ensure_current_url_allowed(&page).await?;

    let selector = args.get("selector").and_then(|v| v.as_str());
    // Viewport capture unless full-page is explicitly requested.
    let full_page = args
        .get("full_page")
        .and_then(|v| v.as_bool())
        .unwrap_or(false);
    let png_bytes = page.screenshot(selector, full_page).await?;

    // Redacted page URL for the caption and the result text, so neither
    // echoes the raw URL back.
    let display_url = page
        .url()
        .await
        .map(|u| redact_url_for_display(&u))
        .unwrap_or_else(|| "current page".to_string());
    let caption = format!("Screenshot of {}", display_url);

    // Guard the delivery path: never enqueue media with an empty session id.
    if session_id.is_empty() {
        return Err("browser actions require a session id".to_string());
    }

    // Decide HOW to deliver BEFORE touching the disk, so a capture that is
    // refused as too large never leaves an orphaned file in the inbox.
    // `as_file` also selects the mode-aware success string below.
    let oversize_reason = screenshot_oversize_reason(&png_bytes);
    let as_file = match screenshot_delivery_kind(png_bytes.len(), oversize_reason) {
        ScreenshotDelivery::Photo => false,
        ScreenshotDelivery::Document => true,
        ScreenshotDelivery::TooLarge => {
            return Err(format!(
                "Screenshot is too large to deliver even as a file ({} bytes, max {}). \
                 Capture a specific element with a selector.",
                png_bytes.len(),
                MAX_SCREENSHOT_DOCUMENT_BYTES
            ));
        }
    };

    let saved_attachment = crate::channels::attachments::save_tool_observation_image(
        &self.inbox_dir,
        &png_bytes,
        "screenshot.png",
        "image/png",
        "browser",
    )
    .map_err(|e| format!("Screenshot captured but failed to save for vision context: {e}"))?;

    // A document is sent from the saved file; a photo carries the bytes inline.
    let kind = if as_file {
        MediaKind::Document {
            file_path: saved_attachment.local_path.clone(),
            filename: saved_attachment.filename.clone(),
        }
    } else {
        MediaKind::Photo { data: png_bytes }
    };

    // Honest delivery: ask the media listener to report the ACTUAL outcome.
    let (result_tx, result_rx) = tokio::sync::oneshot::channel::<Result<(), String>>();
    self.media_tx
        .send(MediaMessage {
            session_id: session_id.to_string(),
            caption: caption.clone(),
            kind,
            result_tx: Some(result_tx),
        })
        .await
        .map_err(|e| format!("Failed to send screenshot to chat: {}", e))?;

    // Wait (bounded) for the listener's verdict. Nesting, outermost first:
    // timeout elapsed / oneshot sender dropped / listener-reported result.
    let text = match tokio::time::timeout(Duration::from_secs(30), result_rx).await {
        Ok(Ok(Ok(()))) => {
            let base = if as_file {
                format!(
                    "Screenshot captured and delivered to chat as a file (the full page was \
                     too large for an inline image). {}",
                    caption
                )
            } else {
                format!("Screenshot captured and delivered to chat. {}", caption)
            };
            format!("{base}\nSaved to: {}", saved_attachment.local_path)
        }
        Ok(Ok(Err(reason))) => {
            return Err(format!(
                "Screenshot captured but could NOT be delivered to chat: {}. The image was not sent.",
                reason
            ))
        }
        Ok(Err(_)) => {
            return Err(
                "Screenshot captured but delivery could not be confirmed (the delivery channel \
                 was dropped). The image may not have been sent."
                    .to_string(),
            )
        }
        Err(_) => {
            return Err(
                "Screenshot captured but delivery could not be confirmed within the timeout. \
                 The image may not have been sent."
                    .to_string(),
            )
        }
    };

    Ok(DispatchResult {
        text,
        attachments: vec![saved_attachment],
    })
}
async fn action_click(&self, args: &Value, session_id: &str) -> Result<String, String> {
let selector = args
.get("selector")
.and_then(|v| v.as_str())
.ok_or_else(|| "Missing required parameter: selector".to_string())?;
// Resolve the session's (active) page FIRST so that creating a session's
// first page does not itself look like a popup. Then snapshot the
// browser's live targets BEFORE the click so we can detect a popup
// (target=_blank / window.open) the click may spawn.
let (page, _guard) = self.page_for(session_id).await?;
// The clicking session's active target id — the ONLY legitimate opener
// for a popup we should attribute to this session.
let clicker_target_id = self.sessions.active_target_id(session_id).await;
let known_before: Vec<String> = self
.backend
.list_targets()
.await
.map(|ts| ts.into_iter().map(|t| t.target_id).collect())
.unwrap_or_default();
page.click(selector).await?;
// Nav-race (replaces the old fixed 500ms sleep): race a navigation
// signal against a short stable-DOM settle. A click that triggers a
// navigation waits for that navigation (bounded by `nav_timeout`, itself
// capped under `action_timeout`); a click that does NOT navigate returns
// quickly after `CLICK_SETTLE`. `wait_for_navigation` resolves only when
// a navigation actually completes, so for a non-navigating click the
// settle timer wins and we return fast.
let nav_budget = self.nav_timeout.min(self.action_timeout);
tokio::select! {
biased;
_ = tokio::time::sleep(CLICK_SETTLE) => {
// Non-navigating (or fast) click: settle elapsed first. Return fast.
}
nav = page.wait_for_navigation(nav_budget) => {
// A navigation completed (or its bounded wait returned). Surface a
// connection-class error so recovery can classify it; otherwise
// proceed to popup detection.
if let Err(e) = nav {
if backend::is_connection_error(&e) {
return Err(e);
}
warn!(error = %e, "click navigation wait reported a non-connection error");
}
}
}
// Popup detection: diff the live targets against what the session knew
// before. A brand-new target is registered as a tab in this session
// ONLY when its CDP `openerId` is this session's active page — so a
// target=_blank click never silently leaves later actions stranded on
// the old implicit page, yet a tab opened by a DIFFERENT session (or
// independently) is never misattributed to us. The new tab is NOT
// auto-activated; the current tab stays active unless the caller
// explicitly switches.
let new_tab_id = self
.detect_and_register_popup(session_id, &known_before, clicker_target_id.as_deref())
.await;
match new_tab_id {
Some(tab_id) => Ok(format!(
"Clicked element '{}' (opened new tab: {})",
selector, tab_id
)),
None => Ok(format!("Clicked element '{}'", selector)),
}
}
/// Popup attribution after an action that may have opened a new target.
///
/// Lists the browser's current targets and looks for one that (a) is absent
/// from `known_before` and (b) has an `opener_id` equal to
/// `clicker_target_id`. The first such target that registers successfully is
/// added to the session as a NON-active tab and its tab id is returned.
///
/// Targets with a different opener, or none, are skipped: they may belong to
/// another session or have been opened independently, and binding them here
/// would let the clicker switch to / read a page that is not its own.
///
/// Returns `None` when `clicker_target_id` is `None`, when the target list
/// cannot be fetched, when a page handle cannot be bound to the first
/// eligible target, or when no eligible target registers.
async fn detect_and_register_popup(
    &self,
    session_id: &str,
    known_before: &[String],
    clicker_target_id: Option<&str>,
) -> Option<String> {
    // No known opener for the clicker means nothing can be proven to belong
    // to this session.
    let opener = match clicker_target_id {
        Some(id) => id,
        None => return None,
    };
    let live_targets = match self.backend.list_targets().await {
        Ok(list) => list,
        Err(_) => return None,
    };

    // Net-new targets whose opener is exactly this session's active page.
    let eligible = live_targets.into_iter().filter(|candidate| {
        !known_before.contains(&candidate.target_id)
            && candidate.opener_id.as_deref() == Some(opener)
    });

    for popup in eligible {
        // Bind a page handle so the session can operate on the popup later.
        let handle = match self.backend.page_for_target(&popup.target_id).await {
            Ok(handle) => handle,
            Err(_) => return None,
        };
        let tab_id = self
            .sessions
            .add_tab(
                session_id,
                &popup.target_id,
                handle,
                popup.url.clone(),
                popup.title.clone(),
                // No context id is recorded for a popup: it shares its
                // opener's browser context, which the session already tracks
                // on the opener tab — recording it again would risk a
                // double-dispose on eviction.
                /* context_id */
                None,
                /* make_active */ false,
            )
            .await;
        if tab_id.is_some() {
            return tab_id;
        }
    }
    None
}
/// `fill`: replace the text of the element matching `selector` with `value`.
///
/// Both `selector` and `value` are required string arguments. The log entry
/// records only the value's byte length, never the value itself.
async fn action_fill(&self, args: &Value, session_id: &str) -> Result<String, String> {
    // Shared lookup for the two required string arguments.
    let required = |key: &str| {
        args.get(key)
            .and_then(|v| v.as_str())
            .ok_or_else(|| format!("Missing required parameter: {}", key))
    };
    let selector = required("selector")?;
    let value = required("value")?;

    let (page, _guard) = self.page_for(session_id).await?;
    page.replace_text(selector, value).await?;

    let value_bytes = value.len();
    tracing::info!(action = "fill", selector, value_bytes, "browser fill");
    Ok(format!("Filled '{}'", selector))
}
/// `get_text`: read text from the active page — the element matching the
/// optional `selector`, or the whole body when no selector is given.
///
/// The text is truncated to 4000 (via `truncate_with_note`) and then passed
/// through secret redaction before being returned.
async fn action_get_text(&self, args: &Value, session_id: &str) -> Result<String, String> {
    let (page, _guard) = self.page_for(session_id).await?;
    // Defense-in-depth: refuse to read if the live committed URL is a blocked
    // host (e.g. reached via post-load JS-redirect/meta-refresh).
    self.ensure_current_url_allowed(&page).await?;

    let raw = match args.get("selector").and_then(|v| v.as_str()) {
        Some(selector) => page.inner_text(selector).await?,
        None => page.body_text().await?,
    };

    // Order matters: truncate first, redact second, so the redacted form is
    // what reaches the user and event persistence.
    // NOTE(review): a secret straddling the truncation point is cut before
    // the redactor sees it — confirm `redact_secrets` still catches the
    // remaining fragment.
    let shortened = crate::utils::truncate_with_note(&raw, 4000);
    Ok(crate::tools::sanitize::redact_secrets(&shortened))
}
async fn action_scroll(&self, args: &Value, session_id: &str) -> Result<String, String> {
let direction = args
.get("direction")
.and_then(|value| value.as_str())
.unwrap_or("down");
let amount = args
.get("amount")
.and_then(|value| value.as_i64())
.unwrap_or(700);
if !(1..=5000).contains(&amount) {
return Err("Parameter 'amount' must be between 1 and 5000 pixels".to_string());
}
let delta_y = match direction {
"down" => amount,
"up" => -amount,
_ => return Err("Parameter 'direction' must be 'up' or 'down'".to_string()),
};
let (page, _guard) = self.page_for(session_id).await?;
page.scroll_by(delta_y).await?;
Ok(format!("Scrolled {direction} {amount} pixels"))
}
/// `execute_js`: evaluate the required `script` argument on the active page
/// and return its result as pretty-printed JSON (or `(no return value)`).
///
/// The rendered result is truncated to 4000 (via `truncate_with_note`) and
/// then secret-redacted before being returned.
async fn action_execute_js(&self, args: &Value, session_id: &str) -> Result<String, String> {
    let script = match args.get("script").and_then(|v| v.as_str()) {
        Some(script) => script,
        None => return Err("Missing required parameter: script".to_string()),
    };
    let (page, _guard) = self.page_for(session_id).await?;

    // Defense-in-depth: check the live committed URL immediately before
    // evaluation, so a script cannot read out a blocked host the page
    // redirected to after load (JS redirect / meta-refresh).
    self.ensure_current_url_allowed(&page).await?;

    let rendered = match page.evaluate(script).await? {
        None => "(no return value)".to_string(),
        Some(value) => {
            serde_json::to_string_pretty(&value).unwrap_or_else(|_| format!("{:?}", value))
        }
    };

    // Truncate first, then redact, so the redacted form is what reaches the
    // user and event persistence — never the raw secret.
    let shortened = crate::utils::truncate_with_note(&rendered, 4000);
    Ok(crate::tools::sanitize::redact_secrets(&shortened))
}
async fn action_wait(&self, args: &Value, session_id: &str) -> Result<String, String> {
let selector = args
.get("selector")
.and_then(|v| v.as_str())
.ok_or_else(|| "Missing required parameter: selector".to_string())?;
// Condition: defaults to `present` (the historical behavior). Any unknown
// value is rejected up front so a typo doesn't silently fall through.
let condition = args
.get("condition")
.and_then(|v| v.as_str())
.unwrap_or("present");
let condition = WaitCondition::parse(condition)?;
// For `text_contains` a needle is required (accept either `text` or the
// shared `value` arg).
let needle = args
.get("text")
.and_then(|v| v.as_str())
.or_else(|| args.get("value").and_then(|v| v.as_str()));
if condition == WaitCondition::TextContains && needle.unwrap_or("").is_empty() {
return Err(
"Missing required parameter for condition 'text_contains': text (or value)"
.to_string(),
);
}
// Resolve the per-call timeout: a provided `timeout_secs` overrides the
// configured default, clamped to the same bound the config applies.
let timeout = match args.get("timeout_secs").and_then(|v| v.as_u64()) {
Some(secs) => Duration::from_secs(secs.clamp(1, MAX_ELEMENT_TIMEOUT_SECS)),
None => self.element_timeout,
};
let (page, _guard) = self.page_for(session_id).await?;
let deadline = tokio::time::Instant::now() + timeout;
let timeout_secs = timeout.as_secs();
loop {
// Evaluate the condition once. A connection-class error from a state
// probe is surfaced so `dispatch_with_recovery` can classify it; a
// benign "not yet" simply keeps polling.
match self
.evaluate_wait_condition(&page, condition, selector, needle)
.await
{
Ok(true) => return Ok(condition.success_message(selector, needle)),
Ok(false) => {}
Err(e) => {
if backend::is_connection_error(&e) {
return Err(e);
}
// Non-connection probe error (e.g. transient) — treat as
// "not satisfied yet" and keep polling within the deadline.
}
}
if tokio::time::Instant::now() >= deadline {
return Err(condition.timeout_message(selector, needle, timeout_secs));
}
tokio::time::sleep(WAIT_POLL_INTERVAL).await;
}
}
/// Evaluate a single `wait` condition against the page. Returns `Ok(true)`
/// when satisfied, `Ok(false)` when not-yet, `Err` on a probe failure.
///
/// Element state is checked WITHOUT interpolating the selector into a JS
/// source string (Task 14 rule): `present` uses the CDP DOM query
/// (`find_element`); `visible`/`hidden`/`enabled` use element-bound constant
/// predicates via the `PageHandle` state methods; `text_contains` reads the
/// element's own `inner_text`.
///
/// For `present` and `text_contains`, a lookup failure normally just means
/// "the element is not there yet" and maps to `Ok(false)` — EXCEPT a
/// connection-class error, which is propagated so the caller's disconnect
/// recovery can run instead of polling a dead connection until timeout.
async fn evaluate_wait_condition(
    &self,
    page: &Arc<dyn PageHandle>,
    condition: WaitCondition,
    selector: &str,
    needle: Option<&str>,
) -> Result<bool, String> {
    match condition {
        WaitCondition::Present => match page.find_element(selector).await {
            Ok(_) => Ok(true),
            // A dead connection is not "absent": surface it for recovery.
            Err(e) if backend::is_connection_error(&e) => Err(e),
            // Any other lookup failure means the element is not present yet.
            Err(_) => Ok(false),
        },
        WaitCondition::Visible => page.is_element_visible(selector).await,
        WaitCondition::Enabled => {
            // Enabled implies present; an absent element reads as not-enabled
            // via the state probe, so no extra presence check is needed.
            page.is_element_enabled(selector).await
        }
        WaitCondition::Hidden => {
            // Hidden == not visible (absent or laid-out-hidden). The
            // visibility probe returns false for an absent element, so the
            // negation covers both cases.
            Ok(!page.is_element_visible(selector).await?)
        }
        WaitCondition::TextContains => {
            let needle = needle.unwrap_or("");
            // A missing element yields no text → not satisfied yet.
            match page.inner_text(selector).await {
                Ok(text) => Ok(text.contains(needle)),
                Err(e) if backend::is_connection_error(&e) => Err(e),
                Err(_) => Ok(false),
            }
        }
    }
}
/// `set_mode`: switch the browser between visible and headless operation.
///
/// The required `value` argument must be `"visible"` (alias `"headed"`) or
/// `"headless"`. The resolved flag and the caller's original mode string are
/// both handed to the backend, whose result is returned as-is.
async fn action_set_mode(&self, args: &Value) -> Result<String, String> {
    let mode = match args.get("value").and_then(|v| v.as_str()) {
        Some(mode) => mode,
        None => {
            return Err(
                "Missing required parameter: value (\"visible\" or \"headless\")".to_string(),
            )
        }
    };

    let headless = if mode == "headless" {
        true
    } else if mode == "visible" || mode == "headed" {
        false
    } else {
        return Err(format!(
            "Invalid mode '{}'. Use 'visible' or 'headless'.",
            mode
        ));
    };

    self.backend.set_headless_mode(headless, mode).await
}
/// `close`: shut the browser backend down and discard all cached state that
/// pointed into it.
///
/// The backend's shutdown result is returned AFTER the session pages have
/// been invalidated and the diagnostics attachment state reset, so the
/// cleanup runs whether or not the shutdown itself succeeded.
async fn action_close(&self) -> Result<String, String> {
    // Graceful teardown is delegated to the backend (launched browser:
    // close + wait + timeout + fallback; attached browser: detach without a
    // browser-close command).
    //
    // NOTE (deferred follow-up): DAEMON-EXIT graceful close is NOT wired.
    // Only `close`, `set_mode` (mode change), and idle eviction reuse this
    // graceful-teardown path; nothing calls it on process exit. A
    // daemon-shutdown hook would require threading a concrete
    // `Arc<BrowserTool>` into `core.rs`'s shutdown handler — out of scope and
    // low value here: on process exit a LAUNCHED Chrome is reclaimed by OS
    // teardown of its temp profile, and an ATTACHED Chrome is unaffected (it
    // is never sent a close command). Left as a documented follow-up.
    let shutdown_outcome = self.backend.shutdown().await;

    // Every cached session page is now a stale handle into a torn-down
    // connection: drop them and reset the diagnostics attachment state.
    self.sessions.invalidate_all_pages().await;
    self.diagnostics.reset_attached().await;

    shutdown_outcome
}
/// Route one action name to its handler, with no recovery logic. Kept separate
/// from `call()` so the recovery wrapper can invoke it a second time for a
/// retry.
///
/// `screenshot` produces a full `DispatchResult` (it carries an attachment);
/// every other handler yields plain text, which is wrapped with
/// `DispatchResult::text_only`. An unrecognized action name is an `Err`
/// listing the valid actions.
async fn dispatch_action(
    &self,
    action: &str,
    args: &Value,
    session_id: &str,
) -> Result<DispatchResult, String> {
    let text_result: Result<String, String> = match action {
        // The only handler that builds its own `DispatchResult`.
        "screenshot" => return self.action_screenshot(args, session_id).await,
        "navigate" => self.action_navigate(args, session_id).await,
        "click" => self.action_click(args, session_id).await,
        "fill" => self.action_fill(args, session_id).await,
        "get_text" => self.action_get_text(args, session_id).await,
        "scroll" => self.action_scroll(args, session_id).await,
        "execute_js" => self.action_execute_js(args, session_id).await,
        "wait" => self.action_wait(args, session_id).await,
        "list_tabs" => self.action_list_tabs(session_id).await,
        "get_console_logs" => self.action_get_console_logs(args, session_id).await,
        "get_network_errors" => self.action_get_network_errors(args, session_id).await,
        "new_tab" => self.action_new_tab(args, session_id).await,
        "switch_tab" => self.action_switch_tab(args, session_id).await,
        "close_tab" => self.action_close_tab(args, session_id).await,
        // Browser-wide actions: no session id needed.
        "set_mode" => self.action_set_mode(args).await,
        "close" => self.action_close().await,
        _ => Err(format!(
            "Unknown browser action: '{}'. Valid actions: navigate, screenshot, click, fill, get_text, scroll, execute_js, wait, list_tabs, get_console_logs, get_network_errors, new_tab, switch_tab, close_tab, set_mode, close",
            action
        )),
    };
    text_result.map(DispatchResult::text_only)
}
/// Decide whether `action` may be re-run AUTOMATICALLY after a
/// connection-class failure and reconnect.
///
/// Returns `true` only when `policy::classify` puts the action in the
/// observation, navigation, or administrative risk class. Everything else
/// (the mutations: `click`, `fill`, `execute_js`) is never auto-replayed: the
/// command may already have reached the browser and run before the connection
/// dropped, so replaying could duplicate a submit/purchase/delete.
///
/// Classification is delegated to `policy::classify` so the
/// observation-vs-mutation boundary has a single source of truth.
fn action_is_safe_to_replay(action: &str) -> bool {
    matches!(
        policy::classify(action, None, None).class,
        BrowserRiskClass::Observation
            | BrowserRiskClass::Navigation
            | BrowserRiskClass::Administrative
    )
}
/// `dispatch_action` plus disconnect recovery.
///
/// Flow:
/// 1. Run the action once. Success, or any error that is NOT
///    connection-class (per `backend::is_connection_error`), is returned
///    unchanged with no reconnect.
/// 2. On a connection-class error: invalidate ALL cached session pages, reset
///    the diagnostics attachment state, and `reconnect()` once. A reconnect
///    failure is returned as an error.
/// 3. After a successful reconnect the outcome depends on idempotency
///    (`action_is_safe_to_replay`):
///    - **Replay-safe (observation / navigation / administrative):** the
///      action is retried exactly ONE time and that result is returned.
///    - **Mutation (`click`/`fill`/`execute_js`):** NEVER replayed — it may
///      have partially executed before the disconnect. A "could not be
///      confirmed; re-issue manually" error is returned instead.
async fn dispatch_with_recovery(
    &self,
    action: &str,
    args: &Value,
    session_id: &str,
) -> Result<DispatchResult, String> {
    match self.dispatch_action(action, args, session_id).await {
        Ok(done) => return Ok(done),
        // Ordinary page errors ("Element not found", "Timeout", ...) are
        // surfaced as-is.
        Err(e) if !backend::is_connection_error(&e) => return Err(e),
        Err(_) => {}
    }

    warn!(
        action,
        "browser action hit a connection-class error; attempting recovery"
    );

    // A dead browser invalidates EVERY session's pages — drop them all so the
    // next page resolution mints fresh handles against the new connection.
    self.sessions.invalidate_all_pages().await;
    self.diagnostics.reset_attached().await;

    // Reconnect exactly once.
    self.backend.reconnect().await.map_err(|reconnect_err| {
        format!(
            "Browser connection lost and reconnect failed: {}. \
             The action did not complete; please retry.",
            reconnect_err
        )
    })?;

    if !Self::action_is_safe_to_replay(action) {
        // Mutation: the connection is restored for subsequent actions, but
        // this action's effect is uncertain, so it is not re-run.
        warn!(
            action,
            "mutation hit a disconnect; NOT replaying (uncertain state)"
        );
        return Err(format!(
            "Browser connection was lost while performing '{}'. The action could NOT be \
             confirmed and may have partially completed — it was NOT retried automatically to \
             avoid duplicating it. The connection has been restored; re-issue the action \
             manually if needed after checking the page state.",
            action
        ));
    }

    // Idempotent: retry once against a freshly-minted page.
    info!(action, "retrying idempotent browser action after reconnect");
    self.dispatch_action(action, args, session_id).await
}
/// Pick the id that the diagnostics actions should report on.
///
/// - With a `tab_id` string argument: it must be non-empty and must match one
///   of this session's tabs (as reported by `list_tabs`); it is returned
///   unchanged.
/// - Without one: the session's `active_target_id` is returned, or an error
///   when the session has none.
///
/// NOTE(review): the explicit path returns a tab id while the default path
/// returns the value of `active_target_id` — confirm these are the same kind
/// of identifier for the diagnostics store's lookups.
async fn resolve_tab_id(&self, args: &Value, session_id: &str) -> Result<String, String> {
    match args.get("tab_id").and_then(|v| v.as_str()) {
        Some("") => Err("tab_id must not be empty".to_string()),
        Some(requested) => {
            // Ownership check: the id must be one of THIS session's tabs.
            let session_tabs = self.sessions.list_tabs(session_id).await;
            let owned = session_tabs.iter().any(|tab| tab.tab_id == requested);
            if owned {
                Ok(requested.to_string())
            } else {
                Err(format!(
                    "Unknown tab '{}'. It does not belong to this session. Use list_tabs to see open tabs.",
                    requested
                ))
            }
        }
        None => match self.sessions.active_target_id(session_id).await {
            Some(active) => Ok(active),
            None => {
                Err("No active tab in this session. Use list_tabs or new_tab first.".to_string())
            }
        },
    }
}
/// `get_console_logs`: return the formatted console output captured for a tab
/// of this session — the tab named by the optional `tab_id` argument, or the
/// one `resolve_tab_id` falls back to.
async fn action_get_console_logs(
    &self,
    args: &Value,
    session_id: &str,
) -> Result<String, String> {
    // Resolving the page first fails early when the session cannot get one.
    let (_page, _guard) = self.page_for(session_id).await?;
    let tab_id = self.resolve_tab_id(args, session_id).await?;
    let rendered = self
        .diagnostics
        .format_console_logs(session_id, &tab_id)
        .await;
    Ok(rendered)
}
/// `get_network_errors`: return the formatted network errors captured for a
/// tab of this session — the tab named by the optional `tab_id` argument, or
/// the one `resolve_tab_id` falls back to.
async fn action_get_network_errors(
    &self,
    args: &Value,
    session_id: &str,
) -> Result<String, String> {
    // Resolving the page first fails early when the session cannot get one.
    let (_page, _guard) = self.page_for(session_id).await?;
    let tab_id = self.resolve_tab_id(args, session_id).await?;
    let rendered = self
        .diagnostics
        .format_network_errors(session_id, &tab_id)
        .await;
    Ok(rendered)
}
/// `list_tabs`: describe this session's open tabs.
///
/// Each entry shows the opaque tab id, the title, a REDACTED origin (never
/// the full URL — paths/queries can carry secrets), and an `[active]` marker.
/// The session's page is resolved first so a fresh session reports its single
/// page rather than "no tabs".
async fn action_list_tabs(&self, session_id: &str) -> Result<String, String> {
    // Guarantees the session exists with its first tab before listing.
    let (_page, _guard) = self.page_for(session_id).await?;

    let session_tabs = self.sessions.list_tabs(session_id).await;
    let listing = if session_tabs.is_empty() {
        "No open tabs.".to_string()
    } else {
        Self::format_tab_list(&session_tabs)
    };
    Ok(listing)
}
/// Render tabs as a header line (`Open tabs (N):`) followed by one bullet per
/// tab: `- <id>[ [active]]: "<title>" — <origin>`.
///
/// A missing title renders as `(untitled)`. The URL is reduced with
/// `redact_origin`; a missing URL or an empty redaction renders as `(no url)`.
fn format_tab_list(tabs: &[TabView]) -> String {
    let header = format!("Open tabs ({}):", tabs.len());
    tabs.iter().fold(header, |mut listing, entry| {
        let origin = match entry.url.as_deref().map(redact_origin) {
            Some(shown) if !shown.is_empty() => shown,
            _ => "(no url)".to_string(),
        };
        let title = entry.title.as_deref().unwrap_or("(untitled)");
        let marker = if entry.active { " [active]" } else { "" };
        listing.push_str(&format!(
            "\n- {}{}: \"{}\" — {}",
            entry.tab_id, marker, title, origin
        ));
        listing
    })
}
/// `new_tab`: open a new tab (a new page in this session's context),
/// optionally navigating it to `url` (SSRF-validated). The new tab becomes
/// active, since opening a tab implies you want to use it. Returns a message
/// containing its opaque tab id.
///
/// The backend page is created before it is registered with the session, so
/// every failure path between those two steps closes the unregistered target
/// (best-effort) instead of leaving an untracked tab open in the browser.
///
/// # Errors
/// Returns `Err` when the session page cannot be resolved, the requested or
/// final URL is blocked, page creation or navigation fails, or the tab cannot
/// be registered with the session.
async fn action_new_tab(&self, args: &Value, session_id: &str) -> Result<String, String> {
    // Ensure the session exists (and has its first tab) before adding more.
    let (_page, _guard) = self.page_for(session_id).await?;
    let url = args.get("url").and_then(|v| v.as_str());
    if let Some(url) = url {
        // Pre-flight SSRF check (host class only — no URL/secret leak).
        if let Err(blocked) = policy::validate_network_url(url) {
            return Err(blocked.message());
        }
    }
    let (target_id, context_id, page) = self.backend.create_page().await?;
    if let Some(url) = url {
        if let Err(e) = page.goto(url).await {
            // The tab exists in the browser but is not registered with any
            // session; close it so it does not linger untracked. Skipped for
            // a connection-class error: the close could not be delivered
            // over the dead connection anyway.
            if !backend::is_connection_error(&e) {
                let _ = self.backend.close_target(&target_id).await;
            }
            return Err(e);
        }
        // Bounded navigation readiness instead of a fixed sleep.
        // Best-effort: a non-connection error proceeds to URL revalidation;
        // a connection error propagates.
        if let Err(e) = page.wait_for_navigation(self.nav_timeout).await {
            if backend::is_connection_error(&e) {
                return Err(e);
            }
            warn!(error = %e, "new_tab navigation wait reported a non-connection error");
        }
    }
    let current_url = page.url().await;
    // Revalidate the new tab's FINAL committed URL, so a redirect to a blocked
    // host can't leave a live tab pointed at an internal address. Close the
    // tab and report the host class only.
    if let Some(ref final_url) = current_url {
        if let Err(blocked) = policy::validate_network_url(final_url) {
            warn!(
                class = blocked.class.label(),
                "new tab landed on a blocked host after redirect; closing"
            );
            // Best-effort backend cleanup — the tab was never registered.
            let _ = self.backend.close_target(&target_id).await;
            return Err(format!(
                "Navigation blocked: redirected to a {}",
                blocked.class.label()
            ));
        }
    }
    let registered = self
        .sessions
        .add_tab(
            session_id,
            &target_id,
            page,
            current_url,
            None,
            context_id,
            /* make_active */ true,
        )
        .await;
    let tab_id = match registered {
        Some(id) => id,
        None => {
            // Registration failed, so nothing tracks this target: close it
            // (best-effort) rather than leak an unreachable tab.
            let _ = self.backend.close_target(&target_id).await;
            return Err("failed to register new tab for this session".to_string());
        }
    };
    match url {
        Some(url) => Ok(format!("Opened new tab {} at {}", tab_id, url)),
        None => Ok(format!("Opened new tab {} (active)", tab_id)),
    }
}
/// `switch_tab`: activate the tab named by the required `tab_id` argument.
///
/// Ownership is enforced by the session registry's `switch_tab`, whose error
/// is returned unchanged. The success message shows only the redacted origin
/// of the tab's URL (or `(no url)`).
async fn action_switch_tab(&self, args: &Value, session_id: &str) -> Result<String, String> {
    // Ensure the session exists before validating ownership.
    let (_page, _guard) = self.page_for(session_id).await?;

    let requested = match args.get("tab_id").and_then(|v| v.as_str()) {
        Some(id) => id,
        None => return Err("Missing required parameter: tab_id".to_string()),
    };
    let activated = self.sessions.switch_tab(session_id, requested).await?;

    let origin = match activated.url.as_deref().map(redact_origin) {
        Some(shown) if !shown.is_empty() => shown,
        _ => "(no url)".to_string(),
    };
    Ok(format!("Switched to tab {} — {}", activated.tab_id, origin))
}
/// `close_tab`: close the tab named by the required `tab_id` argument and
/// report which tab, if any, is active afterwards.
///
/// The tab is removed from the session first (the registry's `close_tab`
/// validates ownership and its error is returned unchanged); the backend
/// close and the diagnostics cleanup follow.
async fn action_close_tab(&self, args: &Value, session_id: &str) -> Result<String, String> {
    // Ensure the session exists before validating ownership.
    let (_page, _guard) = self.page_for(session_id).await?;

    let tab_id = match args.get("tab_id").and_then(|v| v.as_str()) {
        Some(id) => id,
        None => return Err("Missing required parameter: tab_id".to_string()),
    };
    let (target_id, new_active) = self.sessions.close_tab(session_id, tab_id).await?;

    // Best-effort: the session no longer references the tab, so a backend
    // failure here is only logged and cannot leave a dangling reference.
    let backend_close = self.backend.close_target(&target_id).await;
    if let Err(e) = backend_close {
        warn!(tab_id, error = %e, "backend close_target failed after session removal");
    }
    self.diagnostics.drop_tab(session_id, tab_id).await;

    let message = match new_active {
        None => format!(
            "Closed tab {}. No tabs remain open in this session.",
            tab_id
        ),
        Some(active) => format!("Closed tab {}. Active tab is now {}.", tab_id, active),
    };
    Ok(message)
}
async fn run_action(&self, arguments: &str) -> anyhow::Result<DispatchResult> {
let args: Value = serde_json::from_str(arguments)?;
let action = args
.get("action")
.and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("Missing required parameter: action"))?;
let session_id = args
.get("_session_id")
.and_then(|v| v.as_str())
.unwrap_or("");
let needs_session = !matches!(action, "close" | "set_mode");
if needs_session && session_id.is_empty() {
return Ok(DispatchResult::text_only(
"Error: browser actions require a session id".to_string(),
));
}
if action == "execute_js" {
let script = args.get("script").and_then(|v| v.as_str()).unwrap_or("");
if let Err(reason) = validate_script_constraints(script) {
warn!(action, "execute_js script rejected by constraint check");
return Ok(DispatchResult::text_only(format!("Error: {}", reason)));
}
}
let action_args = ActionArgs {
action,
url: args.get("url").and_then(|v| v.as_str()),
selector: args.get("selector").and_then(|v| v.as_str()),
script: args.get("script").and_then(|v| v.as_str()),
tab_id: args.get("tab_id").and_then(|v| v.as_str()),
session_id,
};
if let GateDecision::Deny(reason) = self.approval_gate(&action_args).await {
warn!(action, "Browser action blocked by approval gate");
return Ok(DispatchResult::text_only(format!("Error: {}", reason)));
}
match self.dispatch_with_recovery(action, &args, session_id).await {
Ok(result) => Ok(result),
Err(err_text) => {
warn!(action, error = %err_text, "Browser action failed");
Ok(DispatchResult::text_only(format!("Error: {}", err_text)))
}
}
}
}
#[async_trait]
impl Tool for BrowserTool {
/// The tool's registered name.
fn name(&self) -> &str {
    const TOOL_NAME: &str = "browser";
    TOOL_NAME
}
/// One-line summary of what the tool can do.
fn description(&self) -> &str {
    const SUMMARY: &str = "Control a browser to navigate pages, click elements, fill forms, scroll, take screenshots, extract text, and execute JavaScript. Supports headless and visible modes.";
    SUMMARY
}
/// JSON schema for the tool: name, description, and the argument object.
/// Only `action` is required, and unknown properties are disallowed.
fn schema(&self) -> Value {
    // Per-argument definitions, spliced into `parameters.properties` below.
    let properties = json!({
        "action": {
            "type": "string",
            "enum": ["navigate", "screenshot", "click", "fill", "get_text", "scroll", "execute_js", "wait", "list_tabs", "get_console_logs", "get_network_errors", "new_tab", "switch_tab", "close_tab", "set_mode", "close"],
            "description": "The browser action to perform"
        },
        "url": {
            "type": "string",
            "description": "URL to navigate to (for 'navigate', or optionally for 'new_tab')"
        },
        "selector": {
            "type": "string",
            "description": "CSS selector for the target element (for click, fill, get_text, wait, screenshot)"
        },
        "full_page": {
            "type": "boolean",
            "description": "For 'screenshot' WITHOUT a selector: capture the entire scrollable page instead of just the visible viewport (default false). Full-page captures of long pages may be too large to deliver and will be rejected — prefer the default viewport or a selector."
        },
        "value": {
            "type": "string",
            "description": "Text to type (for 'fill') or mode to set (for 'set_mode': 'visible' or 'headless')"
        },
        "direction": {
            "type": "string",
            "enum": ["up", "down"],
            "description": "Direction for 'scroll' (default: down)"
        },
        "amount": {
            "type": "integer",
            "minimum": 1,
            "maximum": 5000,
            "description": "Distance in pixels for 'scroll' (default: 700)"
        },
        "script": {
            "type": "string",
            "description": "JavaScript code to execute (for 'execute_js' action)"
        },
        "timeout_secs": {
            "type": "integer",
            "description": "Timeout in seconds for 'wait' action (default from config, clamped 1..=120)"
        },
        "condition": {
            "type": "string",
            "enum": ["present", "visible", "enabled", "hidden", "text_contains"],
            "description": "Condition for 'wait' (default: present). present=in DOM; visible=laid out & not hidden; enabled=not disabled; hidden=absent or hidden; text_contains=element text contains 'text'."
        },
        "text": {
            "type": "string",
            "description": "Needle for the 'wait' action's 'text_contains' condition (the substring to wait for in the element's text)"
        },
        "tab_id": {
            "type": "string",
            "description": "Opaque tab id from list_tabs/new_tab (required for 'switch_tab' and 'close_tab'; optional for 'get_console_logs' and 'get_network_errors' — defaults to the active tab)"
        }
    });

    json!({
        "name": "browser",
        "description": "Control a browser for web interactions. Actions: navigate (go to URL), screenshot (capture page as photo), click (click element — reports a new tab id if the click opened one), fill (type into input), get_text (extract text), scroll (move the active page up or down), execute_js (run JavaScript), wait (wait for an element condition: present/visible/enabled/hidden/text_contains), list_tabs (list this session's open tabs with their ids), get_console_logs (read captured console output for a tab), get_network_errors (read captured network load failures for a tab), new_tab (open and switch to a new tab, optionally at a url), switch_tab (make a tab active by its id), close_tab (close a tab by its id), set_mode (switch between 'visible' and 'headless' — use visible for sites that block headless browsers), close (end session). The browser persists across calls for multi-step workflows. Tab ids are opaque tokens returned by list_tabs/new_tab; do not guess them.",
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": ["action"],
            "additionalProperties": false
        }
    })
}
/// Runs the browser action described by `arguments` and returns only the
/// textual part of its result.
///
/// Whatever else the action result carries (e.g. attachments) is discarded on
/// this path. Errors from `run_action` are propagated to the caller.
async fn call(&self, arguments: &str) -> anyhow::Result<String> {
    let action_result = self.run_action(arguments).await?;
    Ok(action_result.text)
}
/// Runs the browser action described by `arguments` and returns the full
/// outcome: the textual output plus any attachments the action produced.
///
/// `status_tx` is intentionally unused here. All metadata fields other than
/// `attachments` are left at their `ToolCallMetadata::default()` values.
/// Errors from `run_action` are propagated to the caller.
async fn call_with_status_outcome(
    &self,
    arguments: &str,
    status_tx: Option<tokio::sync::mpsc::Sender<crate::types::StatusUpdate>>,
) -> anyhow::Result<ToolCallOutcome> {
    // Explicitly mark the status channel as unused without moving it.
    let _ = status_tx;

    let action_result = self.run_action(arguments).await?;

    // Only the attachments are populated; everything else stays default.
    let metadata = ToolCallMetadata {
        attachments: action_result.attachments,
        ..ToolCallMetadata::default()
    };

    Ok(ToolCallOutcome {
        output: action_result.text,
        metadata,
    })
}
/// Reports the fixed capability flags for this tool as a whole.
///
/// The flags do not depend on the action being requested: the tool is declared
/// as not read-only, as having external side effects, as needing approval, as
/// non-idempotent, and as not being a high-impact write.
fn capabilities(&self) -> ToolCapabilities {
    let flags = ToolCapabilities {
        // Flags that are switched on for this tool.
        external_side_effect: true,
        needs_approval: true,
        // Flags that are switched off for this tool.
        read_only: false,
        idempotent: false,
        high_impact_write: false,
    };
    flags
}
/// Classifies a prospective call by inspecting the `action` field of the JSON
/// in `arguments`.
///
/// The action name is trimmed and ASCII-lowercased before matching. Mapping:
/// - `navigate` — observation, with the `url` argument (empty string when
///   absent or not a string) attached as a URL target hint.
/// - `get_text`, `wait`, `get_console_logs`, `get_network_errors` —
///   observation verified against the result content.
/// - `scroll`, `screenshot`, `list_tabs`, `new_tab`, `switch_tab` — plain
///   observation.
/// - `close`, `set_mode`, `close_tab` — administrative.
/// - `click`, `fill`, `execute_js` — mutation.
///
/// Anything else — unparseable JSON, a missing or non-string `action`, or an
/// unrecognised action name — falls back to mutation.
fn call_semantics(&self, arguments: &str) -> ToolCallSemantics {
    /// Looks up `key` in the parsed arguments and returns it if it is a string.
    fn str_field<'a>(args: Option<&'a Value>, key: &str) -> Option<&'a str> {
        args?.get(key)?.as_str()
    }

    let parsed = serde_json::from_str::<Value>(arguments).ok();
    let action = str_field(parsed.as_ref(), "action")
        .map(|raw| raw.trim().to_ascii_lowercase());
    let url = str_field(parsed.as_ref(), "url").unwrap_or_default();

    // Observation whose success is judged from the returned content.
    let content_checked_observation = || {
        ToolCallSemantics::observation()
            .with_verification_mode(ToolVerificationMode::ResultContent)
    };

    // A missing action becomes "", which matches no known name and therefore
    // lands in the mutation fallback, same as any unrecognised action.
    match action.as_deref().unwrap_or_default() {
        "navigate" => {
            ToolCallSemantics::observation().with_target_hint(ToolTargetHintKind::Url, url)
        }
        "get_text" | "wait" | "get_console_logs" | "get_network_errors" => {
            content_checked_observation()
        }
        // scroll/screenshot/list_tabs only look at the page or the session's
        // tab set; new_tab/switch_tab change which page subsequent actions
        // target without mutating page content, mirroring navigate.
        "scroll" | "screenshot" | "list_tabs" | "new_tab" | "switch_tab" => {
            ToolCallSemantics::observation()
        }
        // close_tab tears down session state — administrative, like close.
        "close" | "set_mode" | "close_tab" => ToolCallSemantics::administrative(),
        "click" | "fill" | "execute_js" => ToolCallSemantics::mutation(),
        _ => ToolCallSemantics::mutation(),
    }
}
}
#[cfg(test)]
mod prompt_tests {
    //! Checks on the exact wording produced by `format_browser_approval_prompt`.

    use super::format_browser_approval_prompt;
    use crate::tools::browser::policy::{self, BrowserRiskClass};

    /// Risk descriptor shared by the tests below: navigation class, neither
    /// sensitive nor consequential.
    fn sample_risk() -> policy::BrowserActionRisk {
        policy::BrowserActionRisk {
            consequential: false,
            sensitive: false,
            class: BrowserRiskClass::Navigation,
        }
    }

    /// A navigate prompt is exactly "Open website: <url>" and carries no
    /// bracketed target/risk annotations.
    #[test]
    fn navigate_prompt_is_plain_language() {
        let risk = sample_risk();
        let prompt = format_browser_approval_prompt(
            "navigate",
            "https://newtarget.com",
            None,
            None,
            None,
            &risk,
        );

        assert_eq!(prompt, "Open website: https://newtarget.com");
        for marker in ["[target:", "[risk:"] {
            assert!(!prompt.contains(marker));
        }
    }

    /// An execute_js prompt shows the page and the script size in bytes, not
    /// the script text itself.
    #[test]
    fn execute_js_prompt_hides_script_body() {
        let risk = sample_risk();
        let prompt = format_browser_approval_prompt(
            "execute_js",
            "https://example.com",
            None,
            None,
            Some(512),
            &risk,
        );

        assert_eq!(prompt, "Run JavaScript on https://example.com (512 bytes)");
    }
}