Skip to main content

oxi_agent/tools/browse/
browse_extract_tool.rs

1//! Browse extract tool — extract structured data from a web page.
2//!
3//! All extraction is done via the already-loaded tab's JavaScript engine —
4//! no engine-level methods that would open additional tabs.
5
6use super::config::BrowseConfig;
7use super::engine::{BrowserEngine, BrowserError, BrowserTab};
8use super::helpers;
9use super::tab_guard::TabGuard;
10use crate::tools::{AgentTool, AgentToolResult, ToolContext, ToolError};
11use async_trait::async_trait;
12use serde_json::{json, Value};
13use std::sync::Arc;
14use tokio::sync::oneshot;
15
16/// Extract structured data from a web page using CSS selectors.
17///
18/// Returns links, text content, or element metadata for all elements
19/// matching the given CSS selector.
20pub struct BrowseExtractTool {
21    engine: Arc<dyn BrowserEngine>,
22    config: BrowseConfig,
23}
24
25impl BrowseExtractTool {
26    /// Create with the given engine and default config.
27    pub fn new(engine: Arc<dyn BrowserEngine>) -> Self {
28        Self {
29            engine,
30            config: BrowseConfig::default(),
31        }
32    }
33
34    /// Create with custom configuration.
35    pub fn with_config(engine: Arc<dyn BrowserEngine>, config: BrowseConfig) -> Self {
36        Self { engine, config }
37    }
38}
39
40#[async_trait]
41impl AgentTool for BrowseExtractTool {
42    fn name(&self) -> &str {
43        "browse_extract"
44    }
45
46    fn label(&self) -> &str {
47        "Extract Page Data"
48    }
49
50    fn description(&self) -> &str {
51        "Extract structured data from a web page: links, text content, or elements matching \
52         a CSS selector. Use when you need specific data from a page rather than the full content. \
53         Supports extracting all matching elements or just the first match."
54    }
55
56    fn parameters_schema(&self) -> Value {
57        json!({
58            "type": "object",
59            "properties": {
60                "url": {
61                    "type": "string",
62                    "description": "URL of the page to extract from"
63                },
64                "selector": {
65                    "type": "string",
66                    "description": "CSS selector to match elements"
67                },
68                "extract": {
69                    "type": "string",
70                    "enum": ["links", "text", "elements", "markdown"],
71                    "default": "text",
72                    "description": "What to extract: 'links' (href + text), 'text' (textContent), 'elements' (tag + text + attrs), 'markdown' (innerHTML as markdown)"
73                },
74                "all": {
75                    "type": "boolean",
76                    "default": true,
77                    "description": "Return all matches (true) or just the first (false)"
78                },
79                "timeout": {
80                    "type": "integer",
81                    "default": 30,
82                    "description": "Maximum time in seconds"
83                }
84            },
85            "required": ["url", "selector"]
86        })
87    }
88
89    async fn execute(
90        &self,
91        _tool_call_id: &str,
92        params: Value,
93        _signal: Option<oneshot::Receiver<()>>,
94        _ctx: &ToolContext,
95    ) -> Result<AgentToolResult, ToolError> {
96        let url = params["url"]
97            .as_str()
98            .ok_or_else(|| "Missing required parameter: url".to_string())?;
99
100        let selector = params["selector"]
101            .as_str()
102            .ok_or_else(|| "Missing required parameter: selector".to_string())?;
103
104        let extract = params["extract"].as_str().unwrap_or("text");
105        let all = params["all"].as_bool().unwrap_or(true);
106        let timeout_secs = params["timeout"]
107            .as_u64()
108            .unwrap_or(self.config.page_timeout_secs);
109
110        tracing::info!(url = %url, selector = %selector, extract = %extract, "extracting page data");
111
112        // Wrap the entire operation in a timeout
113        let output = tokio::time::timeout(
114            std::time::Duration::from_secs(timeout_secs),
115            self.extract_from_new_tab(url, selector, extract, all),
116        )
117        .await
118        .map_err(|_| format!("Extract timed out after {}s", timeout_secs))??;
119
120        Ok(output)
121    }
122}
123
124impl BrowseExtractTool {
125    /// Open a tab, navigate, extract, close — one tab, one flow.
126    async fn extract_from_new_tab(
127        &self,
128        url: &str,
129        selector: &str,
130        extract: &str,
131        all: bool,
132    ) -> Result<AgentToolResult, ToolError> {
133        let raw_tab = self
134            .engine
135            .new_tab()
136            .await
137            .map_err(|e| format!("Failed to open browser tab: {}", e))?;
138        let guard = TabGuard::new(raw_tab);
139
140        let page = guard
141            .tab()
142            .goto(url)
143            .await
144            .map_err(|e| format!("Navigation failed: {}", e))?;
145
146        let output = extract_from_tab(guard.tab(), selector, extract, all)
147            .await
148            .map_err(|e: BrowserError| e.to_string())?;
149
150        let metadata_url = page.url.clone();
151        let metadata_title = page.title.clone();
152
153        guard.close().await;
154
155        Ok(AgentToolResult::success(output).with_metadata(json!({
156            "url": metadata_url,
157            "title": metadata_title,
158            "selector": selector,
159            "extract": extract,
160        })))
161    }
162}
163
164// ── Extraction logic ──────────────────────────────────────────────────────────
165
166async fn extract_from_tab(
167    tab: &dyn BrowserTab,
168    selector: &str,
169    extract: &str,
170    all: bool,
171) -> Result<String, BrowserError> {
172    match extract {
173        "links" => {
174            let js = helpers::js_links_within(selector);
175            let value = tab.evaluate(&js).await?;
176            let links = helpers::parse_link_values(value);
177            let links = if all {
178                links
179            } else {
180                links.into_iter().take(1).collect()
181            };
182            let json_links: Vec<Value> = links
183                .iter()
184                .map(|(t, h)| json!({ "text": t, "href": h }))
185                .collect();
186            Ok(serde_json::to_string_pretty(&json_links).unwrap_or_default())
187        }
188        "elements" => {
189            let js = helpers::js_query_elements(selector);
190            let value = tab.evaluate(&js).await?;
191            let elements = helpers::parse_element_values(value);
192            let elements = if all {
193                elements
194            } else {
195                elements.into_iter().take(1).collect()
196            };
197            let json_elems: Vec<Value> = elements
198                .iter()
199                .map(|(tag, text, attrs)| json!({ "tag": tag, "text": text, "attributes": attrs }))
200                .collect();
201            Ok(serde_json::to_string_pretty(&json_elems).unwrap_or_default())
202        }
203        "markdown" => {
204            let texts = tab.query_all(selector).await?;
205            let texts = if all {
206                texts
207            } else {
208                texts.into_iter().take(1).collect()
209            };
210            Ok(texts.join("\n\n"))
211        }
212        _ => {
213            // "text" (default)
214            let texts = tab.query_all(selector).await?;
215            let texts = if all {
216                texts
217            } else {
218                texts.into_iter().take(1).collect()
219            };
220            Ok(texts.join("\n"))
221        }
222    }
223}