agentroot_core/providers/
url.rs

1//! URL Provider for fetching content from web pages
2
3use crate::db::hash_content;
4use crate::error::{AgentRootError, Result};
5use crate::providers::{ProviderConfig, SourceItem, SourceProvider};
6use async_trait::async_trait;
7use reqwest::{Client, StatusCode};
8use std::time::Duration;
9
10/// Provider for fetching content from URLs
11pub struct URLProvider {
12    client: Client,
13}
14
15impl Default for URLProvider {
16    fn default() -> Self {
17        Self::new()
18    }
19}
20
21impl URLProvider {
22    /// Create a new URLProvider with default settings
23    pub fn new() -> Self {
24        let client = Client::builder()
25            .user_agent(concat!("agentroot/", env!("CARGO_PKG_VERSION")))
26            .timeout(Duration::from_secs(30))
27            .redirect(reqwest::redirect::Policy::limited(10))
28            .build()
29            .unwrap_or_else(|_| Client::new());
30        Self { client }
31    }
32
33    /// Create a URLProvider with custom client
34    pub fn with_client(client: Client) -> Self {
35        Self { client }
36    }
37
38    /// Fetch content from a URL with proper error handling
39    async fn fetch_url(&self, url: &str) -> Result<String> {
40        let response = self.client.get(url).send().await.map_err(|e| {
41            if e.is_timeout() {
42                AgentRootError::ExternalError(format!(
43                    "Request timeout fetching {}: Server took too long to respond.",
44                    url
45                ))
46            } else if e.is_connect() {
47                AgentRootError::ExternalError(format!(
48                    "Connection error fetching {}: Cannot reach server. Check your internet connection.",
49                    url
50                ))
51            } else {
52                AgentRootError::ExternalError(format!("Failed to fetch URL {}: {}", url, e))
53            }
54        })?;
55
56        let status = response.status();
57        if !status.is_success() {
58            let error_msg = match status {
59                StatusCode::NOT_FOUND => format!("URL not found (404): {}", url),
60                StatusCode::FORBIDDEN => {
61                    format!(
62                        "Access forbidden (403): {}. Authentication may be required.",
63                        url
64                    )
65                }
66                StatusCode::UNAUTHORIZED => {
67                    format!(
68                        "Unauthorized (401): {}. Valid credentials are required.",
69                        url
70                    )
71                }
72                StatusCode::TOO_MANY_REQUESTS => {
73                    format!("Rate limit exceeded (429): {}. Try again later.", url)
74                }
75                s if s.is_server_error() => {
76                    format!(
77                        "Server error ({}): {}. The server is experiencing issues.",
78                        s.as_u16(),
79                        url
80                    )
81                }
82                _ => format!(
83                    "HTTP error {}: {}",
84                    status.as_u16(),
85                    status.canonical_reason().unwrap_or("Unknown error")
86                ),
87            };
88            return Err(AgentRootError::ExternalError(error_msg));
89        }
90
91        response.text().await.map_err(|e| {
92            AgentRootError::ExternalError(format!(
93                "Failed to read response body from {}: {}",
94                url, e
95            ))
96        })
97    }
98
99    /// Extract title from content (looks for markdown # header or HTML title)
100    fn extract_title(&self, content: &str, url: &str) -> String {
101        if let Some(title) = content.lines().find(|line| line.trim().starts_with("# ")) {
102            return title.trim_start_matches("# ").trim().to_string();
103        }
104
105        if let Some(start) = content.find("<title>") {
106            if let Some(end) = content[start..].find("</title>") {
107                let title = &content[start + 7..start + end];
108                return title.trim().to_string();
109            }
110        }
111
112        url.split('/')
113            .filter(|s| !s.is_empty())
114            .next_back()
115            .map(|s| s.to_string())
116            .unwrap_or_else(|| "Untitled".to_string())
117    }
118}
119
120#[async_trait]
121impl SourceProvider for URLProvider {
122    fn provider_type(&self) -> &'static str {
123        "url"
124    }
125
126    async fn list_items(&self, config: &ProviderConfig) -> Result<Vec<SourceItem>> {
127        let item = self.fetch_item(&config.base_path).await?;
128        Ok(vec![item])
129    }
130
131    async fn fetch_item(&self, uri: &str) -> Result<SourceItem> {
132        let content = self.fetch_url(uri).await?;
133        let title = self.extract_title(&content, uri);
134        let hash = hash_content(&content);
135
136        let mut item = SourceItem::new(uri.to_string(), title, content, hash, "url".to_string());
137        item.metadata.insert("url".to_string(), uri.to_string());
138
139        Ok(item)
140    }
141}
142
#[cfg(test)]
mod tests {
    use super::*;

    /// Body with neither a Markdown heading nor an HTML `<title>`, so the
    /// title has to come from the URL.
    const PLAIN_TEXT: &str = "Just some text";

    /// The provider identifies itself as `"url"`.
    #[test]
    fn test_provider_type() {
        let url_provider = URLProvider::new();
        assert_eq!(url_provider.provider_type(), "url");
    }

    /// A Markdown `# ` heading supplies the title.
    #[test]
    fn test_extract_title_from_markdown() {
        let url_provider = URLProvider::new();
        let markdown = "# Hello World\n\nSome content";
        assert_eq!(
            url_provider.extract_title(markdown, "https://example.com/test.md"),
            "Hello World"
        );
    }

    /// An HTML `<title>` element supplies the title.
    #[test]
    fn test_extract_title_from_html() {
        let url_provider = URLProvider::new();
        let html = "<html><head><title>Test Page</title></head><body>Content</body></html>";
        assert_eq!(
            url_provider.extract_title(html, "https://example.com/test.html"),
            "Test Page"
        );
    }

    /// Without any title in the body, the last URL path segment is used.
    #[test]
    fn test_extract_title_from_url() {
        let url_provider = URLProvider::new();
        assert_eq!(
            url_provider.extract_title(PLAIN_TEXT, "https://example.com/my-document.txt"),
            "my-document.txt"
        );
    }

    /// With a trailing slash and no path, the host is the last segment.
    #[test]
    fn test_extract_title_fallback() {
        let url_provider = URLProvider::new();
        assert_eq!(
            url_provider.extract_title(PLAIN_TEXT, "https://example.com/"),
            "example.com"
        );
    }

    /// Fetching from an unresolvable host reports an error.
    #[tokio::test]
    async fn test_fetch_invalid_url() {
        let url_provider = URLProvider::new();
        let outcome = url_provider
            .fetch_url("http://thisurldoesnotexist12345.invalid")
            .await;
        assert!(outcome.is_err());
    }
}
193}