anno/ingest/
url_resolver.rs1use crate::Result;
6use std::collections::HashMap;
7
/// Content produced by resolving a URL.
///
/// `Default`, `PartialEq` and `Eq` are derived in addition to `Debug` and
/// `Clone` so that values can be compared directly (e.g. in tests) and built
/// with struct-update syntax.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ResolvedContent {
    /// Text obtained from the resolved resource.
    pub text: String,
    /// Free-form key/value metadata attached by the resolver.
    pub metadata: HashMap<String, String>,
    /// The URL that was resolved.
    pub source_url: String,
}
18
19pub trait UrlResolver: std::fmt::Debug {
21 fn can_resolve(&self, url: &str) -> bool;
23
24 fn resolve(&self, url: &str) -> Result<ResolvedContent>;
26}
27
/// Resolver for `http://` and `https://` URLs.
///
/// The type carries no state; every instance is interchangeable.
#[derive(Debug, Default)]
pub struct HttpResolver;
33
34impl HttpResolver {
35 #[must_use]
37 pub fn new() -> Self {
38 Self
39 }
40
41 #[allow(dead_code)] fn extract_text_from_html(&self, html: &str) -> String {
46 let mut text = String::with_capacity(html.len());
47 let mut in_tag = false;
48 let mut in_script = false;
49 let mut in_style = false;
50 let mut chars = html.chars().peekable();
51
52 while let Some(ch) = chars.next() {
53 match ch {
54 '<' => {
55 in_tag = true;
56 let mut tag_buffer = String::new();
58 tag_buffer.push('<');
59 let mut tag_name = String::new();
60 let mut in_tag_name = true;
61
62 while let Some(&next_ch) = chars.peek() {
63 if next_ch == '>' {
64 chars.next();
65 tag_buffer.push('>');
66 let tag_lower = tag_name.to_lowercase();
67 if tag_lower == "script" || tag_lower.starts_with("script ") {
68 in_script = true;
69 } else if tag_lower == "/script" || tag_lower.starts_with("/script ") {
70 in_script = false;
71 } else if tag_lower == "style" || tag_lower.starts_with("style ") {
72 in_style = true;
73 } else if tag_lower == "/style" || tag_lower.starts_with("/style ") {
74 in_style = false;
75 }
76 in_tag = false;
77 break;
78 } else if next_ch.is_whitespace() {
79 in_tag_name = false;
80 tag_buffer.push(
81 chars
82 .next()
83 .expect("chars.peek() returned Some, so next() should be Some"),
84 );
85 } else if in_tag_name {
86 tag_name.push(
87 chars
88 .next()
89 .expect("chars.peek() returned Some, so next() should be Some"),
90 );
91 } else {
92 tag_buffer.push(
93 chars
94 .next()
95 .expect("chars.peek() returned Some, so next() should be Some"),
96 );
97 }
98 }
99 if !in_script && !in_style {
101 if matches!(
103 tag_name.to_lowercase().as_str(),
104 "p" | "div" | "br" | "li" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
105 ) && !text.ends_with(' ')
106 && !text.is_empty()
107 {
108 text.push(' ');
109 }
110 }
111 }
112 '>' if in_tag => {
113 in_tag = false;
114 }
115 _ if in_tag || in_script || in_style => {
116 }
118 '&' => {
119 let mut entity = String::new();
121 entity.push('&');
122 let mut found_semicolon = false;
123 while let Some(&next_ch) = chars.peek() {
124 entity.push(
125 chars
126 .next()
127 .expect("chars.peek() returned Some, so next() should be Some"),
128 );
129 if next_ch == ';' {
130 found_semicolon = true;
131 break;
132 }
133 if next_ch.is_whitespace() || next_ch == '<' {
134 break;
135 }
136 }
137
138 if found_semicolon {
139 let decoded = match entity.as_str() {
140 "&" => "&",
141 "<" => "<",
142 ">" => ">",
143 """ => "\"",
144 "'" => "'",
145 " " => " ",
146 "'" => "'",
147 "’" => "'",
148 "“" => "\"",
149 "”" => "\"",
150 _ => {
151 if entity.starts_with("&#") && entity.len() > 2 {
153 let num_str = &entity[2..entity.len() - 1];
154 if let Ok(num) = num_str.parse::<u32>() {
155 if let Some(ch) = char::from_u32(num) {
156 text.push(ch);
157 continue;
158 }
159 }
160 }
161 text.push_str(&entity);
163 continue;
164 }
165 };
166 text.push_str(decoded);
167 } else {
168 text.push('&');
170 text.push_str(&entity[1..]);
171 }
172 }
173 ch if !in_tag && !in_script && !in_style => {
174 text.push(ch);
175 }
176 _ => {}
177 }
178 }
179
180 let mut cleaned = String::with_capacity(text.len());
191 let mut last_was_space = true; for ch in text.chars() {
193 if ch.is_whitespace() {
194 if !last_was_space {
195 cleaned.push(' ');
196 last_was_space = true;
197 }
198 } else {
199 cleaned.push(ch);
200 last_was_space = false;
201 }
202 }
203 cleaned.trim().to_string()
204 }
205}
206
207impl UrlResolver for HttpResolver {
208 fn can_resolve(&self, url: &str) -> bool {
209 url.starts_with("http://") || url.starts_with("https://")
210 }
211
212 fn resolve(&self, url: &str) -> Result<ResolvedContent> {
213 #[cfg(feature = "eval")]
214 {
215 let _url = url; let response = ureq::get(url)
219 .timeout(std::time::Duration::from_secs(60))
220 .call()
221 .map_err(|e| {
222 let error_msg = format!("{}", e);
223 crate::Error::InvalidInput(format!(
224 "Network error fetching {}: {}. \
225 Check your internet connection and try again.",
226 url, error_msg
227 ))
228 })?;
229
230 if response.status() != 200 {
231 return Err(crate::Error::InvalidInput(format!(
232 "HTTP {} fetching {}. \
233 Server returned error status. \
234 URL may be temporarily unavailable or changed.",
235 response.status(),
236 url
237 )));
238 }
239
240 let content = response.into_string().map_err(|e| {
241 crate::Error::InvalidInput(format!(
242 "Failed to read response from {}: {}. \
243 Response may be too large or corrupted.",
244 url, e
245 ))
246 })?;
247
248 let mut metadata = HashMap::new();
249 metadata.insert("content-type".to_string(), "text/html".to_string());
250 metadata.insert("source".to_string(), "http".to_string());
251
252 let text = if content.trim_start().starts_with('<') {
254 metadata.insert("content-type".to_string(), "text/html".to_string());
256 self.extract_text_from_html(&content)
257 } else {
258 metadata.insert("content-type".to_string(), "text/plain".to_string());
260 content
261 };
262
263 Ok(ResolvedContent {
264 text,
265 metadata,
266 source_url: url.to_string(),
267 })
268 }
269
270 #[cfg(not(feature = "eval"))]
271 {
272 #[allow(unused_variables)]
273 let _url = url;
274 Err(crate::Error::InvalidInput(
275 "URL resolution requires 'eval' feature. \
276 Enable it with: cargo build -p anno-cli --features eval"
277 .to_string(),
278 ))
279 }
280 }
281}
282
283pub struct CompositeResolver {
285 resolvers: Vec<Box<dyn UrlResolver>>,
286}
287
288impl std::fmt::Debug for CompositeResolver {
289 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
290 f.debug_struct("CompositeResolver")
291 .field("resolver_count", &self.resolvers.len())
292 .finish()
293 }
294}
295
296impl CompositeResolver {
297 #[must_use]
299 pub fn new() -> Self {
300 let resolvers = vec![Box::new(HttpResolver::new()) as Box<dyn UrlResolver>];
301 Self { resolvers }
302 }
303
304 pub fn add_resolver(&mut self, resolver: Box<dyn UrlResolver>) {
306 self.resolvers.push(resolver);
307 }
308}
309
310impl Default for CompositeResolver {
311 fn default() -> Self {
312 Self::new()
313 }
314}
315
316impl UrlResolver for CompositeResolver {
317 fn can_resolve(&self, url: &str) -> bool {
318 self.resolvers.iter().any(|r| r.can_resolve(url))
319 }
320
321 fn resolve(&self, url: &str) -> Result<ResolvedContent> {
322 for resolver in &self.resolvers {
323 if resolver.can_resolve(url) {
324 return resolver.resolve(url);
325 }
326 }
327 Err(crate::Error::InvalidInput(format!(
328 "No resolver available for URL: {}",
329 url
330 )))
331 }
332}
333
#[cfg(test)]
mod tests {
    use super::*;

    /// Lowercase `http://` and `https://` URLs are accepted.
    #[test]
    fn test_http_resolver_can_resolve_http() {
        let resolver = HttpResolver::new();
        assert!(resolver.can_resolve("http://example.com"));
        assert!(resolver.can_resolve("https://example.com"));
        assert!(resolver.can_resolve("http://example.com/path?query=1"));
        assert!(resolver.can_resolve("https://subdomain.example.com/path"));
    }

    /// Scheme matching is case-sensitive: uppercase schemes are rejected.
    #[test]
    fn test_http_resolver_case_sensitive() {
        let resolver = HttpResolver::new();
        assert!(!resolver.can_resolve("HTTP://example.com"));
        assert!(!resolver.can_resolve("HTTPS://example.com"));
    }

    /// Anything that is not http(s) is rejected.
    #[test]
    fn test_http_resolver_cannot_resolve_other_schemes() {
        let resolver = HttpResolver::new();
        assert!(!resolver.can_resolve("ftp://example.com"));
        assert!(!resolver.can_resolve("file:///path/to/file"));
        assert!(!resolver.can_resolve("mailto:test@example.com"));
        assert!(!resolver.can_resolve("not_a_url"));
    }

    #[test]
    fn test_resolved_content_struct() {
        let content = ResolvedContent {
            text: "Hello world".to_string(),
            metadata: HashMap::new(),
            source_url: "https://example.com".to_string(),
        };

        assert_eq!(content.text, "Hello world");
        assert!(content.metadata.is_empty());
        assert_eq!(content.source_url, "https://example.com");
    }

    #[test]
    fn test_resolved_content_with_metadata() {
        let mut metadata = HashMap::new();
        metadata.insert("content-type".to_string(), "text/html".to_string());

        let content = ResolvedContent {
            text: "Test".to_string(),
            metadata,
            source_url: "https://test.com".to_string(),
        };

        assert_eq!(
            content.metadata.get("content-type"),
            Some(&"text/html".to_string())
        );
    }

    #[test]
    fn test_composite_resolver_creation() {
        let resolver = CompositeResolver::new();
        assert!(resolver.can_resolve("https://example.com"));
    }

    #[test]
    fn test_composite_resolver_default() {
        let resolver = CompositeResolver::default();
        assert!(resolver.can_resolve("http://example.com"));
    }

    #[test]
    fn test_composite_resolver_cannot_resolve_unknown() {
        let resolver = CompositeResolver::new();
        assert!(!resolver.can_resolve("custom://unknown"));
    }

    #[test]
    fn test_composite_resolver_debug() {
        let resolver = CompositeResolver::new();
        let debug = format!("{:?}", resolver);
        assert!(debug.contains("CompositeResolver"));
        assert!(debug.contains("resolver_count"));
    }

    #[test]
    fn test_http_resolver_debug() {
        let resolver = HttpResolver::new();
        let debug = format!("{:?}", resolver);
        assert!(debug.contains("HttpResolver"));
    }

    /// Newlines, tabs and runs of spaces collapse to single spaces, and
    /// non-Latin scripts pass through untouched.
    #[test]
    fn test_extract_text_from_html_collapses_whitespace() {
        let resolver = HttpResolver::new();
        // A regular (non-raw) literal so the tabs are spelled out as `\t`
        // and cannot be lost to editor whitespace normalisation.
        let html = "
            <html>
              <head><title>t</title></head>
              <body>
                <h1>Hello
                    world</h1>
                <p>Line1<br>Line2</p>
                <div>Tabbed\t\ttext</div>
                <p>習近平在北京會見了普京。</p>
                <p>التقى محمد بن سلمان بالرئيس في الرياض</p>
                <p>Путин встретился с Си Цзиньпином в Москве.</p>
                <p>प्रधान मंत्री शर्मा आज आए।</p>
              </body>
            </html>
        ";

        let text = resolver.extract_text_from_html(html);
        assert!(text.contains("Hello world"));
        assert!(text.contains("Line1 Line2"));
        assert!(text.contains("Tabbed text"));
        assert!(text.contains("習近平在北京會見了普京。"));
        assert!(text.contains("التقى محمد بن سلمان بالرئيس في الرياض"));
        assert!(text.contains("Путин встретился с Си Цзиньпином в Москве."));
        assert!(text.contains("प्रधान मंत्री शर्मा आज आए।"));

        // No raw line breaks or tabs survive.
        assert!(!text.contains('\n'));
        assert!(!text.contains('\t'));

        // No run of two or more spaces survives (single spaces are expected).
        assert!(!text.contains("  "));
    }

    /// Script and style bodies never reach the extracted text.
    #[test]
    fn test_extract_text_from_html_skips_script_and_style() {
        let resolver = HttpResolver::new();
        let html = "<script>var a = 1;</script><style>p { color: red; }</style><p>Hi</p>";
        assert_eq!(resolver.extract_text_from_html(html), "Hi");
    }

    /// Decimal numeric character references are decoded.
    #[test]
    fn test_extract_text_from_html_decodes_decimal_reference() {
        let resolver = HttpResolver::new();
        assert_eq!(resolver.extract_text_from_html("x&#65;y"), "xAy");
    }

    #[test]
    fn test_resolved_content_clone() {
        let mut metadata = HashMap::new();
        metadata.insert("key".to_string(), "value".to_string());

        let content = ResolvedContent {
            text: "test".to_string(),
            metadata,
            source_url: "http://test.com".to_string(),
        };

        let cloned = content.clone();
        assert_eq!(content.text, cloned.text);
        assert_eq!(content.source_url, cloned.source_url);
        assert_eq!(content.metadata, cloned.metadata);
    }

    /// Without the `eval` feature, resolving reports which feature is missing.
    #[test]
    #[cfg(not(feature = "eval"))]
    fn test_http_resolver_without_feature() {
        let resolver = HttpResolver::new();
        let result = resolver.resolve("https://example.com");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("eval"));
    }

    #[test]
    fn test_composite_resolver_no_matching_resolver() {
        let resolver = CompositeResolver { resolvers: vec![] };
        let result = resolver.resolve("any://url");
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("No resolver available"));
    }
}