agentroot_core/providers/
url.rs1use crate::db::hash_content;
4use crate::error::{AgentRootError, Result};
5use crate::providers::{ProviderConfig, SourceItem, SourceProvider};
6use async_trait::async_trait;
7use reqwest::{Client, StatusCode};
8use std::time::Duration;
9
10pub struct URLProvider {
12 client: Client,
13}
14
15impl Default for URLProvider {
16 fn default() -> Self {
17 Self::new()
18 }
19}
20
21impl URLProvider {
22 pub fn new() -> Self {
24 let client = Client::builder()
25 .user_agent(concat!("agentroot/", env!("CARGO_PKG_VERSION")))
26 .timeout(Duration::from_secs(30))
27 .redirect(reqwest::redirect::Policy::limited(10))
28 .build()
29 .unwrap_or_else(|_| Client::new());
30 Self { client }
31 }
32
33 pub fn with_client(client: Client) -> Self {
35 Self { client }
36 }
37
38 async fn fetch_url(&self, url: &str) -> Result<String> {
40 let response = self.client.get(url).send().await.map_err(|e| {
41 if e.is_timeout() {
42 AgentRootError::ExternalError(format!(
43 "Request timeout fetching {}: Server took too long to respond.",
44 url
45 ))
46 } else if e.is_connect() {
47 AgentRootError::ExternalError(format!(
48 "Connection error fetching {}: Cannot reach server. Check your internet connection.",
49 url
50 ))
51 } else {
52 AgentRootError::ExternalError(format!("Failed to fetch URL {}: {}", url, e))
53 }
54 })?;
55
56 let status = response.status();
57 if !status.is_success() {
58 let error_msg = match status {
59 StatusCode::NOT_FOUND => format!("URL not found (404): {}", url),
60 StatusCode::FORBIDDEN => {
61 format!(
62 "Access forbidden (403): {}. Authentication may be required.",
63 url
64 )
65 }
66 StatusCode::UNAUTHORIZED => {
67 format!(
68 "Unauthorized (401): {}. Valid credentials are required.",
69 url
70 )
71 }
72 StatusCode::TOO_MANY_REQUESTS => {
73 format!("Rate limit exceeded (429): {}. Try again later.", url)
74 }
75 s if s.is_server_error() => {
76 format!(
77 "Server error ({}): {}. The server is experiencing issues.",
78 s.as_u16(),
79 url
80 )
81 }
82 _ => format!(
83 "HTTP error {}: {}",
84 status.as_u16(),
85 status.canonical_reason().unwrap_or("Unknown error")
86 ),
87 };
88 return Err(AgentRootError::ExternalError(error_msg));
89 }
90
91 response.text().await.map_err(|e| {
92 AgentRootError::ExternalError(format!(
93 "Failed to read response body from {}: {}",
94 url, e
95 ))
96 })
97 }
98
99 fn extract_title(&self, content: &str, url: &str) -> String {
101 if let Some(title) = content.lines().find(|line| line.trim().starts_with("# ")) {
102 return title.trim_start_matches("# ").trim().to_string();
103 }
104
105 if let Some(start) = content.find("<title>") {
106 if let Some(end) = content[start..].find("</title>") {
107 let title = &content[start + 7..start + end];
108 return title.trim().to_string();
109 }
110 }
111
112 url.split('/')
113 .filter(|s| !s.is_empty())
114 .next_back()
115 .map(|s| s.to_string())
116 .unwrap_or_else(|| "Untitled".to_string())
117 }
118}
119
120#[async_trait]
121impl SourceProvider for URLProvider {
122 fn provider_type(&self) -> &'static str {
123 "url"
124 }
125
126 async fn list_items(&self, config: &ProviderConfig) -> Result<Vec<SourceItem>> {
127 let item = self.fetch_item(&config.base_path).await?;
128 Ok(vec![item])
129 }
130
131 async fn fetch_item(&self, uri: &str) -> Result<SourceItem> {
132 let content = self.fetch_url(uri).await?;
133 let title = self.extract_title(&content, uri);
134 let hash = hash_content(&content);
135
136 let mut item = SourceItem::new(uri.to_string(), title, content, hash, "url".to_string());
137 item.metadata.insert("url".to_string(), uri.to_string());
138
139 Ok(item)
140 }
141}
142
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs title extraction on a freshly constructed provider.
    fn title_of(content: &str, url: &str) -> String {
        URLProvider::new().extract_title(content, url)
    }

    #[test]
    fn test_provider_type() {
        assert_eq!(URLProvider::new().provider_type(), "url");
    }

    // A Markdown level-one heading takes precedence as the title.
    #[test]
    fn test_extract_title_from_markdown() {
        let extracted = title_of(
            "# Hello World\n\nSome content",
            "https://example.com/test.md",
        );
        assert_eq!(extracted, "Hello World");
    }

    // Without a Markdown heading, the HTML <title> element is used.
    #[test]
    fn test_extract_title_from_html() {
        let page = "<html><head><title>Test Page</title></head><body>Content</body></html>";
        let extracted = title_of(page, "https://example.com/test.html");
        assert_eq!(extracted, "Test Page");
    }

    // Plain text has no embedded title, so the last URL segment is used.
    #[test]
    fn test_extract_title_from_url() {
        let extracted = title_of("Just some text", "https://example.com/my-document.txt");
        assert_eq!(extracted, "my-document.txt");
    }

    // A trailing slash leaves the host as the last non-empty segment.
    #[test]
    fn test_extract_title_fallback() {
        let extracted = title_of("Just some text", "https://example.com/");
        assert_eq!(extracted, "example.com");
    }

    // Fetching from a host under the `.invalid` TLD must yield an error.
    #[tokio::test]
    async fn test_fetch_invalid_url() {
        let outcome = URLProvider::new()
            .fetch_url("http://thisurldoesnotexist12345.invalid")
            .await;
        assert!(outcome.is_err());
    }
}