Skip to main content

oxi_agent/tools/browse/
browse_extract_tool.rs

1//! Browse extract tool — extract structured data from a web page.
2//!
3//! All extraction is done via the already-loaded tab's JavaScript engine —
4//! no engine-level methods that would open additional tabs.
5
6use super::config::BrowseConfig;
7use super::engine::{BrowserEngine, BrowserError, BrowserTab};
8use super::helpers;
9use super::tab_guard::TabGuard;
10use crate::tools::{AgentTool, AgentToolResult, ToolContext, ToolError};
11use async_trait::async_trait;
12use serde_json::{json, Value};
13use parking_lot::Mutex;
14use std::sync::Arc;
15use tokio::sync::oneshot;
16
17/// Extract structured data from a web page using CSS selectors.
18///
19/// Returns links, text content, or element metadata for all elements
20/// matching the given CSS selector.
21pub struct BrowseExtractTool {
22    engine: Arc<dyn BrowserEngine>,
23    config: BrowseConfig,
24    /// Shared callback management (progress + browse progress).
25    callbacks: super::callback_mixin::BrowseCallbacks,
26    /// Shared slot for the current tab's ID.
27    tab_id_slot: Mutex<Arc<parking_lot::Mutex<Option<uuid::Uuid>>>>,
28}
29
30impl BrowseExtractTool {
31    /// Create with the given engine and default config.
32    pub fn new(engine: Arc<dyn BrowserEngine>) -> Self {
33        Self {
34            engine,
35            config: BrowseConfig::default(),
36            callbacks: super::callback_mixin::BrowseCallbacks::new(),
37            tab_id_slot: Mutex::new(Arc::new(parking_lot::Mutex::new(None))),
38        }
39    }
40
41    /// Create with custom configuration.
42    pub fn with_config(engine: Arc<dyn BrowserEngine>, config: BrowseConfig) -> Self {
43        Self {
44            engine,
45            config,
46            callbacks: super::callback_mixin::BrowseCallbacks::new(),
47            tab_id_slot: Mutex::new(Arc::new(parking_lot::Mutex::new(None))),
48        }
49    }
50}
51
52#[async_trait]
53impl AgentTool for BrowseExtractTool {
54    fn name(&self) -> &str {
55        "browse_extract"
56    }
57
58    fn label(&self) -> &str {
59        "Extract Page Data"
60    }
61
62    fn description(&self) -> &str {
63        "Extract structured data from a web page: links, text content, or elements matching \
64         a CSS selector. Use when you need specific data from a page rather than the full content. \
65         Supports extracting all matching elements or just the first match."
66    }
67
68    fn on_progress(&self, callback: crate::tools::ProgressCallback) {
69        self.callbacks.store_progress(callback);
70    }
71
72    fn on_browse_progress(
73        &self,
74        callback: Arc<dyn Fn(super::BrowseProgress) + Send + Sync>,
75    ) {
76        self.callbacks.store_browse(callback);
77    }
78
79    fn set_tab_id_slot(&self, slot: Arc<parking_lot::Mutex<Option<uuid::Uuid>>>) {
80        *self.tab_id_slot.lock() = slot;
81    }
82
83    fn current_tab_id(&self) -> Option<uuid::Uuid> {
84        *self.tab_id_slot.lock().lock()
85    }
86
87    fn parameters_schema(&self) -> Value {
88        json!({
89            "type": "object",
90            "properties": {
91                "url": {
92                    "type": "string",
93                    "description": "URL of the page to extract from"
94                },
95                "selector": {
96                    "type": "string",
97                    "description": "CSS selector to match elements"
98                },
99                "extract": {
100                    "type": "string",
101                    "enum": ["links", "text", "elements", "markdown"],
102                    "default": "text",
103                    "description": "What to extract: 'links' (href + text), 'text' (textContent), 'elements' (tag + text + attrs), 'markdown' (innerHTML as markdown)"
104                },
105                "all": {
106                    "type": "boolean",
107                    "default": true,
108                    "description": "Return all matches (true) or just the first (false)"
109                },
110                "timeout": {
111                    "type": "integer",
112                    "default": 30,
113                    "description": "Maximum time in seconds"
114                }
115            },
116            "required": ["url", "selector"]
117        })
118    }
119
120    async fn execute(
121        &self,
122        _tool_call_id: &str,
123        params: Value,
124        _signal: Option<oneshot::Receiver<()>>,
125        _ctx: &ToolContext,
126    ) -> Result<AgentToolResult, ToolError> {
127        let url = params["url"]
128            .as_str()
129            .ok_or_else(|| "Missing required parameter: url".to_string())?;
130
131        let selector = params["selector"]
132            .as_str()
133            .ok_or_else(|| "Missing required parameter: selector".to_string())?;
134
135        let extract = params["extract"].as_str().unwrap_or("text");
136        let all = params["all"].as_bool().unwrap_or(true);
137        let timeout_secs = params["timeout"]
138            .as_u64()
139            .unwrap_or(self.config.page_timeout_secs);
140
141        tracing::info!(url = %url, selector = %selector, extract = %extract, "extracting page data");
142
143        // Wrap the entire operation in a timeout
144        let output = tokio::time::timeout(
145            std::time::Duration::from_secs(timeout_secs),
146            self.extract_from_new_tab(url, selector, extract, all),
147        )
148        .await
149        .map_err(|_| format!("Extract timed out after {}s", timeout_secs))??;
150
151        Ok(output)
152    }
153}
154
155impl BrowseExtractTool {
156    /// Open a tab, navigate, extract, close — one tab, one flow.
157    async fn extract_from_new_tab(
158        &self,
159        url: &str,
160        selector: &str,
161        extract: &str,
162        all: bool,
163    ) -> Result<AgentToolResult, ToolError> {
164        let raw_tab = self
165            .engine
166            .new_tab()
167            .await
168            .map_err(|e| format!("Failed to open browser tab: {}", e))?;
169
170        // Store tab_id so the agent loop can include it in
171        // ToolExecutionUpdate events.
172        let tab_id = raw_tab.tab_id();
173        *self.tab_id_slot.lock().lock() = Some(tab_id);
174
175        // Register progress callbacks on the tab via the engine's registry.
176        self.callbacks.register_on_registry(
177            tab_id,
178            self.engine.callback_registry().as_ref(),
179        );
180
181        let guard = TabGuard::new(raw_tab);
182
183        let page = guard
184            .tab()
185            .goto(url)
186            .await
187            .map_err(|e| format!("Navigation failed: {}", e))?;
188
189        let output = extract_from_tab(guard.tab(), selector, extract, all)
190            .await
191            .map_err(|e: BrowserError| e.to_string())?;
192
193        let metadata_url = page.url.clone();
194        let metadata_title = page.title.clone();
195        let result_count = count_extracted_items(&output, extract);
196
197        guard.close().await;
198        *self.tab_id_slot.lock().lock() = None;
199
200        Ok(AgentToolResult::success(output).with_metadata(json!({
201            "url": metadata_url,
202            "title": metadata_title,
203            "selector": selector,
204            "extract": extract,
205            "result_count": result_count,
206        })))
207    }
208}
209
210// ── Extraction logic ──────────────────────────────────────────────────────────
211
212/// Count items in extraction output for metadata.
213fn count_extracted_items(output: &str, extract: &str) -> usize {
214    match extract {
215        "links" | "elements" => {
216            // JSON array output — count top-level array elements.
217            serde_json::from_str::<Vec<serde_json::Value>>(output)
218                .map(|v| v.len())
219                .unwrap_or(0)
220        }
221        _ => {
222            // Text/markdown — count non-empty lines.
223            output.lines().filter(|l| !l.trim().is_empty()).count()
224        }
225    }
226}
227
228async fn extract_from_tab(
229    tab: &dyn BrowserTab,
230    selector: &str,
231    extract: &str,
232    all: bool,
233) -> Result<String, BrowserError> {
234    match extract {
235        "links" => {
236            let js = helpers::js_links_within(selector);
237            let value = tab.evaluate(&js).await?;
238            let links = helpers::parse_link_values(value);
239            let links = if all {
240                links
241            } else {
242                links.into_iter().take(1).collect()
243            };
244            let json_links: Vec<Value> = links
245                .iter()
246                .map(|(t, h)| json!({ "text": t, "href": h }))
247                .collect();
248            Ok(serde_json::to_string_pretty(&json_links).unwrap_or_default())
249        }
250        "elements" => {
251            let js = helpers::js_query_elements(selector);
252            let value = tab.evaluate(&js).await?;
253            let elements = helpers::parse_element_values(value);
254            let elements = if all {
255                elements
256            } else {
257                elements.into_iter().take(1).collect()
258            };
259            let json_elems: Vec<Value> = elements
260                .iter()
261                .map(|(tag, text, attrs)| json!({ "tag": tag, "text": text, "attributes": attrs }))
262                .collect();
263            Ok(serde_json::to_string_pretty(&json_elems).unwrap_or_default())
264        }
265        "markdown" => {
266            let texts = tab.query_all(selector).await?;
267            let texts = if all {
268                texts
269            } else {
270                texts.into_iter().take(1).collect()
271            };
272            Ok(texts.join("\n\n"))
273        }
274        _ => {
275            // "text" (default)
276            let texts = tab.query_all(selector).await?;
277            let texts = if all {
278                texts
279            } else {
280                texts.into_iter().take(1).collect()
281            };
282            Ok(texts.join("\n"))
283        }
284    }
285}