thulp_browser/
lib.rs

1//! # thulp-browser
2//!
3//! Web browser automation and scraping utilities for thulp.
4//!
5//! This crate provides tools for:
6//! - Web page fetching and parsing
7//! - HTML content extraction
8//! - Basic web scraping operations
9
10use serde::{Deserialize, Serialize};
11
/// Result type for browser operations.
///
/// Shorthand for `std::result::Result` with the error type fixed to
/// [`BrowserError`].
pub type Result<T> = std::result::Result<T, BrowserError>;
14
/// Errors that can occur in browser operations.
///
/// Every variant carries a human-readable message that is interpolated into
/// the `Display` output declared by its `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum BrowserError {
    /// An HTTP request failed or the response body could not be read.
    #[error("HTTP error: {0}")]
    Http(String),

    /// Content could not be parsed.
    #[error("Parse error: {0}")]
    Parse(String),

    /// A URL was rejected as invalid.
    #[error("Invalid URL: {0}")]
    InvalidUrl(String),
}
27
/// Web page content.
///
/// All fields are public, so a `Page` can also be assembled by hand; the
/// field descriptions below describe what [`Page::new`] stores.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Page {
    /// The URL of the page
    pub url: String,

    /// The HTML content
    pub html: String,

    /// The page title (if found); `Page::new` derives it from `html`
    pub title: Option<String>,

    /// HTTP status code
    pub status: u16,
}
43
44impl Page {
45    /// Create a new page
46    pub fn new(url: String, html: String, status: u16) -> Self {
47        let title = extract_title(&html);
48        Self {
49            url,
50            html,
51            title,
52            status,
53        }
54    }
55
56    /// Extract text content from the HTML
57    pub fn text(&self) -> String {
58        // Simple text extraction - in a real implementation would use html5ever or similar
59        strip_html_tags(&self.html)
60    }
61
62    /// Get the content length
63    pub fn len(&self) -> usize {
64        self.html.len()
65    }
66
67    /// Check if the page is empty
68    pub fn is_empty(&self) -> bool {
69        self.html.is_empty()
70    }
71}
72
73/// Simple web client for fetching pages
74pub struct WebClient {
75    /// HTTP client
76    client: reqwest::Client,
77}
78
79impl WebClient {
80    /// Create a new web client
81    pub fn new() -> Self {
82        Self {
83            client: reqwest::Client::new(),
84        }
85    }
86
87    /// Fetch a web page
88    pub async fn fetch(&self, url: &str) -> Result<Page> {
89        let response = self
90            .client
91            .get(url)
92            .send()
93            .await
94            .map_err(|e| BrowserError::Http(e.to_string()))?;
95
96        let status = response.status().as_u16();
97        let html = response
98            .text()
99            .await
100            .map_err(|e| BrowserError::Http(e.to_string()))?;
101
102        Ok(Page::new(url.to_string(), html, status))
103    }
104}
105
106impl Default for WebClient {
107    fn default() -> Self {
108        Self::new()
109    }
110}
111
/// Extract the contents of the first `<title>` element in `html`.
///
/// Tag names are matched ASCII case-insensitively and the opening tag may
/// carry attributes, so `<TITLE>` and `<title lang="en">` are both recognised.
/// Longer tag names that merely start with `title` (e.g. `<titlebar>`) are
/// skipped. The returned text is trimmed of surrounding whitespace; character
/// references such as `&amp;` are returned as written.
///
/// Returns `None` when there is no opening title tag, or when the first one
/// found has no closing `</title` after it. This is a plain byte scan, not an
/// HTML parser.
fn extract_title(html: &str) -> Option<String> {
    const OPEN: &[u8] = b"<title";
    const CLOSE: &[u8] = b"</title";

    /// Offset of the first ASCII-case-insensitive match of `needle` at or
    /// after `from`.
    fn find_ignore_case(haystack: &[u8], needle: &[u8], from: usize) -> Option<usize> {
        haystack
            .get(from..)?
            .windows(needle.len())
            .position(|window| window.eq_ignore_ascii_case(needle))
            .map(|pos| pos + from)
    }

    let bytes = html.as_bytes();
    let mut cursor = 0;

    loop {
        let open = find_ignore_case(bytes, OPEN, cursor)?;
        let name_end = open + OPEN.len();

        // The tag name must end right here; otherwise this is some other
        // element whose name only begins with "title".
        let is_title_tag = matches!(
            bytes.get(name_end),
            Some(b) if *b == b'>' || *b == b'/' || b.is_ascii_whitespace()
        );
        if !is_title_tag {
            cursor = name_end;
            continue;
        }

        // Skip any attributes up to the end of the opening tag.
        let tag_close = bytes[name_end..].iter().position(|&b| b == b'>')?;
        let content_start = name_end + tag_close + 1;
        let content_end = find_ignore_case(bytes, CLOSE, content_start)?;

        // Both offsets sit next to ASCII bytes ('>' and '<'), so they are
        // valid UTF-8 character boundaries and the slice cannot panic.
        return Some(html[content_start..content_end].trim().to_string());
    }
}
120
/// Strip HTML tags from content.
///
/// Everything from a `<` up to and including the next `>` is discarded; all
/// other characters are copied through unchanged. A `>` that appears outside
/// any tag is ordinary text and is kept. An unterminated `<` swallows the rest
/// of the input.
///
/// This is a simple character scan, not an HTML parser: comments, the bodies
/// of `<script>`/`<style>` elements and character references are not treated
/// specially.
fn strip_html_tags(html: &str) -> String {
    // The output can never be longer than the input.
    let mut text = String::with_capacity(html.len());
    let mut in_tag = false;

    for ch in html.chars() {
        match (in_tag, ch) {
            (false, '<') => in_tag = true,
            (true, '>') => in_tag = false,
            (false, _) => text.push(ch),
            (true, _) => {}
        }
    }

    text
}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Small document shared by the `Page` tests.
    const SAMPLE_HTML: &str = "<html><title>Test</title><body>Content</body></html>";

    #[test]
    fn test_page_creation() {
        let page = Page::new(
            "https://example.com".to_string(),
            SAMPLE_HTML.to_string(),
            200,
        );

        assert_eq!(page.url, "https://example.com");
        assert_eq!(page.status, 200);
        assert_eq!(page.title.as_deref(), Some("Test"));
        assert_eq!(page.html, SAMPLE_HTML);
    }

    #[test]
    fn test_page_without_title() {
        let page = Page::new(
            "https://example.com".to_string(),
            "<html><body>Content</body></html>".to_string(),
            404,
        );

        assert_eq!(page.title, None);
        assert_eq!(page.status, 404);
    }

    #[test]
    fn test_page_text_len_and_is_empty() {
        let page = Page::new(
            "https://example.com".to_string(),
            SAMPLE_HTML.to_string(),
            200,
        );
        assert_eq!(page.text(), "TestContent");
        assert_eq!(page.len(), SAMPLE_HTML.len());
        assert!(!page.is_empty());

        let blank = Page::new("https://example.com".to_string(), String::new(), 204);
        assert_eq!(blank.len(), 0);
        assert!(blank.is_empty());
        assert_eq!(blank.text(), "");
    }

    #[test]
    fn test_extract_title() {
        let html = "<html><head><title>Test Title</title></head></html>";
        assert_eq!(extract_title(html), Some("Test Title".to_string()));
    }

    #[test]
    fn test_extract_title_trims_and_handles_missing() {
        assert_eq!(
            extract_title("<title>  Padded  </title>"),
            Some("Padded".to_string())
        );
        assert_eq!(extract_title("<p>no title here</p>"), None);
        assert_eq!(extract_title("<title>never closed"), None);
    }

    #[test]
    fn test_strip_html_tags() {
        let html = "<html><body><p>Hello <b>World</b></p></body></html>";
        assert_eq!(strip_html_tags(html), "Hello World");
    }

    #[test]
    fn test_web_client_creation() {
        // Construction must not panic through either entry point.
        let _client = WebClient::new();
        let _default = WebClient::default();
    }
}