Skip to main content

thulp_browser/
lib.rs

1//! # thulp-browser
2//!
3//! Web browser automation and scraping utilities for thulp.
4//!
5//! This crate provides tools for:
6//! - Web page fetching and parsing
7//! - HTML content extraction
8//! - Basic web scraping operations
9//! - CDP (Chrome DevTools Protocol) browser automation (feature-gated)
10//!
11//! ## Basic Web Fetching
12//!
13//! ```rust,no_run
14//! use thulp_browser::WebClient;
15//!
16//! # async fn example() -> Result<(), thulp_browser::BrowserError> {
17//! let client = WebClient::new();
18//! let page = client.fetch("https://example.com").await?;
19//! println!("Title: {:?}", page.title);
20//! println!("Text: {}", page.text());
21//! # Ok(())
22//! # }
23//! ```
24//!
25//! ## CDP Browser Automation (requires `cdp` feature)
26//!
27//! ```rust,ignore
28//! use thulp_browser::cdp::{Browser, BrowserConfig};
29//!
30//! # async fn example() -> Result<(), thulp_browser::BrowserError> {
31//! let config = BrowserConfig::new().headless(true);
32//! let browser = Browser::launch(config).await?;
33//! let page = browser.new_page().await?;
34//! page.navigate("https://example.com").await?;
35//! let screenshot = page.screenshot().await?;
36//! # Ok(())
37//! # }
38//! ```
39
40use serde::{Deserialize, Serialize};
41
42/// Result type for browser operations
43pub type Result<T> = std::result::Result<T, BrowserError>;
44
45/// Errors that can occur in browser operations
46#[derive(Debug, thiserror::Error)]
47pub enum BrowserError {
48    #[error("HTTP error: {0}")]
49    Http(String),
50
51    #[error("Parse error: {0}")]
52    Parse(String),
53
54    #[error("Invalid URL: {0}")]
55    InvalidUrl(String),
56
57    #[error("CDP connection error: {0}")]
58    CdpConnection(String),
59
60    #[error("CDP protocol error: {0}")]
61    CdpProtocol(String),
62
63    #[error("Browser launch failed: {0}")]
64    BrowserLaunch(String),
65
66    #[error("Page not found: {0}")]
67    PageNotFound(String),
68
69    #[error("Navigation failed: {0}")]
70    Navigation(String),
71
72    #[error("JavaScript evaluation failed: {0}")]
73    JavaScriptEval(String),
74
75    #[error("Screenshot capture failed: {0}")]
76    Screenshot(String),
77
78    #[error("Timeout: {0}")]
79    Timeout(String),
80}
81
82/// Web page content
83#[derive(Debug, Clone, Serialize, Deserialize)]
84pub struct Page {
85    /// The URL of the page
86    pub url: String,
87
88    /// The HTML content
89    pub html: String,
90
91    /// The page title (if found)
92    pub title: Option<String>,
93
94    /// HTTP status code
95    pub status: u16,
96}
97
98impl Page {
99    /// Create a new page
100    pub fn new(url: String, html: String, status: u16) -> Self {
101        let title = extract_title(&html);
102        Self {
103            url,
104            html,
105            title,
106            status,
107        }
108    }
109
110    /// Extract text content from the HTML
111    pub fn text(&self) -> String {
112        // Simple text extraction - in a real implementation would use html5ever or similar
113        strip_html_tags(&self.html)
114    }
115
116    /// Get the content length
117    pub fn len(&self) -> usize {
118        self.html.len()
119    }
120
121    /// Check if the page is empty
122    pub fn is_empty(&self) -> bool {
123        self.html.is_empty()
124    }
125}
126
127/// Simple web client for fetching pages
128pub struct WebClient {
129    /// HTTP client
130    client: reqwest::Client,
131}
132
133impl WebClient {
134    /// Create a new web client
135    pub fn new() -> Self {
136        Self {
137            client: reqwest::Client::new(),
138        }
139    }
140
141    /// Fetch a web page
142    pub async fn fetch(&self, url: &str) -> Result<Page> {
143        let response = self
144            .client
145            .get(url)
146            .send()
147            .await
148            .map_err(|e| BrowserError::Http(e.to_string()))?;
149
150        let status = response.status().as_u16();
151        let html = response
152            .text()
153            .await
154            .map_err(|e| BrowserError::Http(e.to_string()))?;
155
156        Ok(Page::new(url.to_string(), html, status))
157    }
158}
159
160impl Default for WebClient {
161    fn default() -> Self {
162        Self::new()
163    }
164}
165
/// Extracts the text of the first `<title>` element in `html`.
///
/// The tag name is matched ASCII-case-insensitively and the opening tag may
/// carry attributes, so `<TITLE>` and `<title lang="en">` are both accepted.
/// Look-alike elements such as `<titlebar>` are skipped. The returned text is
/// trimmed of surrounding whitespace; entities are not decoded.
///
/// Returns `None` when there is no opening tag, the opening tag is never
/// closed with `>`, or no closing `</title` follows it.
fn extract_title(html: &str) -> Option<String> {
    // Byte offset of the first ASCII-case-insensitive match of `needle` at or
    // after `from`.
    fn find_ci(haystack: &[u8], needle: &[u8], from: usize) -> Option<usize> {
        haystack
            .get(from..)?
            .windows(needle.len())
            .position(|window| window.eq_ignore_ascii_case(needle))
            .map(|pos| from + pos)
    }

    const OPEN: &[u8] = b"<title";
    const CLOSE: &[u8] = b"</title";

    let bytes = html.as_bytes();
    let mut cursor = 0;

    // Locate the byte just past the `>` that ends the opening tag.
    let content_start = loop {
        let after_name = find_ci(bytes, OPEN, cursor)? + OPEN.len();
        match *bytes.get(after_name)? {
            b'>' => break after_name + 1,
            b if b.is_ascii_whitespace() => {
                // Attributes follow; the tag ends at the next `>`.
                let gt = bytes[after_name..].iter().position(|&c| c == b'>')?;
                break after_name + gt + 1;
            }
            // A longer tag name (e.g. `<titlebar>`): keep looking.
            _ => cursor = after_name,
        }
    };

    let content_end = find_ci(bytes, CLOSE, content_start)?;

    // Both offsets sit next to ASCII bytes (`>` / `<`), so they are valid
    // `char` boundaries even when the title contains multi-byte text.
    Some(html[content_start..content_end].trim().to_string())
}
174
/// Removes markup from `html`, returning only its text content.
///
/// This is a lightweight scanner, not a full HTML parser:
/// - everything from `<` to the next `>` is treated as a tag and dropped;
/// - `<!-- ... -->` comments are dropped as a whole, even if they contain `>`;
/// - the contents of `<script>` and `<style>` elements are dropped, since
///   they are code rather than page text;
/// - a `>` that is not closing a tag is kept as ordinary text;
/// - an unterminated tag, comment, or script/style element swallows the rest
///   of the input.
///
/// Character entities are not decoded and no whitespace is inserted between
/// adjacent elements.
fn strip_html_tags(html: &str) -> String {
    // Byte offset of the first ASCII-case-insensitive match of `needle`.
    fn find_ci(haystack: &str, needle: &str) -> Option<usize> {
        haystack
            .as_bytes()
            .windows(needle.len())
            .position(|window| window.eq_ignore_ascii_case(needle.as_bytes()))
    }

    // The output can never be longer than the input.
    let mut text = String::with_capacity(html.len());
    let mut rest = html;

    while let Some(lt) = rest.find('<') {
        text.push_str(&rest[..lt]);
        let markup = &rest[lt..];

        // Comments end at `-->`, not at the first `>`.
        if let Some(comment) = markup.strip_prefix("<!--") {
            rest = match comment.find("-->") {
                Some(end) => &comment[end + "-->".len()..],
                None => "",
            };
            continue;
        }

        let gt = match markup.find('>') {
            Some(gt) => gt,
            // Unterminated tag: nothing after it counts as text.
            None => return text,
        };
        let tag_body = &markup[1..gt];
        rest = &markup[gt + 1..];

        // The tag name is the leading run of ASCII alphanumerics; it is empty
        // for closing tags (`/p`) and declarations (`!DOCTYPE`).
        let name_len = tag_body
            .find(|c: char| !c.is_ascii_alphanumeric())
            .unwrap_or(tag_body.len());
        let name = &tag_body[..name_len];

        let closing_tag = if name.eq_ignore_ascii_case("script") {
            Some("</script")
        } else if name.eq_ignore_ascii_case("style") {
            Some("</style")
        } else {
            None
        };

        // Skip raw-text element contents up to (not including) the closing
        // tag, which the next iteration then drops like any other tag.
        if let Some(closing_tag) = closing_tag {
            rest = match find_ci(rest, closing_tag) {
                Some(at) => &rest[at..],
                None => "",
            };
        }
    }

    text.push_str(rest);
    text
}
192
193/// CDP browser automation module.
194#[cfg(feature = "cdp")]
195pub mod cdp;
196
/// Unit tests for page construction, title extraction, tag stripping and
/// error formatting. None of them touch the network.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a page with a fixed URL and a 200 status around `html`.
    fn page_with(html: &str) -> Page {
        Page::new("https://example.com".to_string(), html.to_string(), 200)
    }

    #[test]
    fn test_page_creation() {
        let page = page_with("<html><title>Test</title><body>Content</body></html>");

        assert_eq!(page.url, "https://example.com");
        assert_eq!(page.status, 200);
        assert_eq!(page.title, Some("Test".to_string()));
    }

    #[test]
    fn test_page_without_title() {
        let page = page_with("<html><body>Content</body></html>");
        assert_eq!(page.title, None);
    }

    #[test]
    fn test_extract_title() {
        let html = "<html><head><title>Test Title</title></head></html>";
        assert_eq!(extract_title(html), Some("Test Title".to_string()));
    }

    #[test]
    fn test_extract_title_missing() {
        assert_eq!(extract_title("<p>no title</p>"), None);
        assert_eq!(extract_title("<title>unterminated"), None);
    }

    #[test]
    fn test_strip_html_tags() {
        let html = "<html><body><p>Hello <b>World</b></p></body></html>";
        let text = strip_html_tags(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
    }

    #[test]
    fn test_web_client_creation() {
        // Construction must not panic, through either entry point.
        let _client = WebClient::new();
        let _default = WebClient::default();
    }

    #[test]
    fn test_browser_error_display() {
        let err = BrowserError::CdpConnection("failed to connect".to_string());
        assert_eq!(err.to_string(), "CDP connection error: failed to connect");
    }

    #[test]
    fn test_page_text() {
        let text = page_with("<p>Hello</p><p>World</p>").text();
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
        assert!(!text.contains("<p>"));
    }

    #[test]
    fn test_page_len() {
        let page = page_with("<p>Test</p>");
        assert_eq!(page.len(), 11);
        assert!(!page.is_empty());
    }

    #[test]
    fn test_empty_page() {
        let page = page_with("");
        assert_eq!(page.len(), 0);
        assert!(page.is_empty());
        assert_eq!(page.title, None);
        assert_eq!(page.text(), "");
    }
}
263}