anno/ingest/
url_resolver.rs

1//! URL resolution connectors for fetching content from URLs.
2//!
3//! Provides a trait-based system for resolving different URL types to text content.
4
5use crate::Result;
6use std::collections::HashMap;
7
/// Resolved content from a URL.
///
/// Values compare equal when their text, metadata and source URL are all
/// equal, which lets callers and tests compare whole results directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ResolvedContent {
    /// The extracted text content
    pub text: String,
    /// Metadata about the source (title, content-type, etc.)
    pub metadata: HashMap<String, String>,
    /// The original URL
    pub source_url: String,
}
18
/// Trait for URL resolvers that can fetch and extract text from URLs.
///
/// Implementors must also implement [`std::fmt::Debug`] (supertrait bound).
pub trait UrlResolver: std::fmt::Debug {
    /// Check if this resolver can handle the given URL.
    ///
    /// Returns `true` when this resolver is willing to handle `url`.
    fn can_resolve(&self, url: &str) -> bool;

    /// Resolve the URL to text content.
    ///
    /// # Errors
    ///
    /// Returns an error when the resolver cannot produce content for `url`.
    fn resolve(&self, url: &str) -> Result<ResolvedContent>;
}
27
/// HTTP/HTTPS URL resolver.
///
/// Fetches content from HTTP/HTTPS URLs and extracts text from HTML if needed.
#[derive(Debug, Default)]
pub struct HttpResolver;

impl HttpResolver {
    /// Create a new HTTP resolver.
    #[must_use]
    pub fn new() -> Self {
        Self
    }

    /// Extract text from HTML content (simple, no full HTML parser).
    ///
    /// What it does:
    /// - Drops tags, comments (`<!-- ... -->`) and the contents of `<script>` / `<style>`.
    ///   Inside those raw-text elements only the matching closing tag is recognised, so a
    ///   stray `<` in JavaScript or CSS cannot end the element early or hide its real end.
    /// - Inserts a space around block-level tags (`p`, `div`, `br`, `li`, `h1`..`h6`),
    ///   including their closing (`</p>`) and self-closing (`<br/>`) forms, so adjacent
    ///   blocks do not get glued together.
    /// - Treats a `<` that cannot start a tag (e.g. `1 < 2`) as literal text.
    /// - Decodes common named entities plus decimal and hexadecimal numeric entities.
    ///   Unknown or malformed entities are kept verbatim.
    /// - Collapses every run of whitespace to a single space and trims the result.
    ///
    /// `>` inside quoted attribute values is not handled: a tag ends at its first `>`.
    #[allow(dead_code)] // Only reached from `resolve` when the `eval` feature is on, and from tests.
    fn extract_text_from_html(&self, html: &str) -> String {
        let mut text = String::with_capacity(html.len());
        // While inside <script>/<style>: the closing tag name that ends the raw-text run.
        let mut raw_text_close: Option<&'static str> = None;
        let mut rest = html;

        while !rest.is_empty() {
            if let Some(close) = raw_text_close {
                // Skip raw text; the only thing that matters is the matching closing tag.
                rest = match rest.find('<') {
                    None => "",
                    Some(lt) => {
                        let after = &rest[lt + 1..];
                        if Self::tag_name(after).eq_ignore_ascii_case(close) {
                            raw_text_close = None;
                            Self::skip_past_tag_end(after)
                        } else {
                            after
                        }
                    }
                };
                continue;
            }

            // Copy plain text up to the next character that needs interpretation.
            let pos = match rest.find(|c: char| matches!(c, '<' | '&')) {
                Some(pos) => pos,
                None => {
                    text.push_str(rest);
                    break;
                }
            };
            text.push_str(&rest[..pos]);
            let is_entity = rest.as_bytes()[pos] == b'&';
            let after = &rest[pos + 1..];

            rest = if is_entity {
                Self::push_entity(&mut text, after)
            } else if !Self::starts_markup(after) {
                // A lone `<` (as in "a < b") is text, not the start of a tag.
                text.push('<');
                after
            } else if let Some(after_comment) = Self::skip_comment(after) {
                after_comment
            } else {
                let name = Self::tag_name(after).to_ascii_lowercase();
                match name.as_str() {
                    "script" => raw_text_close = Some("/script"),
                    "style" => raw_text_close = Some("/style"),
                    _ => {
                        // `/p` and `br/` are the closing / self-closing forms of `p` and `br`.
                        let bare = name.trim_matches('/');
                        if Self::is_block_tag(bare) && !text.is_empty() && !text.ends_with(' ') {
                            text.push(' ');
                        }
                    }
                }
                Self::skip_past_tag_end(after)
            };
        }

        // Clean up whitespace.
        //
        // HTML whitespace semantics are "collapsed": runs of whitespace render as a single space
        // (outside of <pre>, which we don't handle here). If we preserve raw newlines/indentation
        // from the HTML source, we end up with spans whose (start,end) point into `doc.text`,
        // but whose extracted `surface` has spaces (many NER backends reconstruct surfaces by
        // joining tokens with spaces). That mismatch creates a lot of validation noise on real
        // pages and makes debug output harder to trust.
        //
        // So: collapse ALL whitespace to single spaces and trim.
        let mut cleaned = String::with_capacity(text.len());
        for word in text.split_whitespace() {
            if !cleaned.is_empty() {
                cleaned.push(' ');
            }
            cleaned.push_str(word);
        }
        cleaned
    }

    /// Returns the raw tag name: everything after `<` up to the first whitespace or `>`.
    fn tag_name(after_lt: &str) -> &str {
        let end = after_lt
            .find(|c: char| c == '>' || c.is_whitespace())
            .unwrap_or(after_lt.len());
        &after_lt[..end]
    }

    /// Returns the input after the tag's closing `>`, or `""` if the tag never closes.
    fn skip_past_tag_end(after_lt: &str) -> &str {
        match after_lt.find('>') {
            Some(gt) => &after_lt[gt + 1..],
            None => "",
        }
    }

    /// Whether the text following `<` can begin a tag, closing tag, comment/doctype or PI.
    fn starts_markup(after_lt: &str) -> bool {
        after_lt
            .chars()
            .next()
            .map_or(false, |c| c.is_ascii_alphabetic() || matches!(c, '/' | '!' | '?'))
    }

    /// If the text following `<` opens a comment that is properly closed by `-->`,
    /// returns the input after the comment. Unterminated comments return `None` so the
    /// caller falls back to lenient "skip to the first `>`" handling.
    fn skip_comment(after_lt: &str) -> Option<&str> {
        if !after_lt.starts_with("!--") {
            return None;
        }
        // Start searching at the opener's own dashes so `<!-->` and `<!--->` also close.
        let search_from = 1;
        after_lt[search_from..]
            .find("-->")
            .map(|p| &after_lt[search_from + p + 3..])
    }

    /// Whether `name` (lowercase, slashes stripped) is a tag that separates words.
    fn is_block_tag(name: &str) -> bool {
        matches!(
            name,
            "p" | "div" | "br" | "li" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
        )
    }

    /// Handles the text following an `&`: appends the decoded entity (or the literal text
    /// when it is not a recognised entity) to `text` and returns the unconsumed input.
    ///
    /// The scan stops *before* whitespace, `<` or another `&`, so a bare ampersand
    /// (as in `AT&T<br>`) never swallows the start of the following tag or entity.
    fn push_entity<'a>(text: &mut String, after_amp: &'a str) -> &'a str {
        let stop = after_amp.find(|c: char| matches!(c, ';' | '<' | '&') || c.is_whitespace());
        let name = match stop {
            Some(i) if after_amp.as_bytes()[i] == b';' => &after_amp[..i],
            _ => {
                // Not an entity: keep the ampersand, let the main loop handle what follows.
                text.push('&');
                return after_amp;
            }
        };

        match Self::decode_entity(name) {
            Some(ch) => text.push(ch),
            None => {
                // Unknown entity, keep as-is.
                text.push('&');
                text.push_str(name);
                text.push(';');
            }
        }
        &after_amp[name.len() + 1..]
    }

    /// Decodes an entity name (the part between `&` and `;`) to a character.
    ///
    /// Curly quotes given numerically (8217, 8220, 8221) are normalised to their ASCII
    /// counterparts, and `nbsp` becomes a regular space.
    fn decode_entity(name: &str) -> Option<char> {
        match name {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            "nbsp" => Some(' '),
            _ => {
                let digits = name.strip_prefix('#')?;
                let code = match digits.strip_prefix('x').or_else(|| digits.strip_prefix('X')) {
                    Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                    None => digits.parse::<u32>().ok()?,
                };
                match code {
                    8217 => Some('\''),
                    8220 | 8221 => Some('"'),
                    _ => char::from_u32(code),
                }
            }
        }
    }
}
206
207impl UrlResolver for HttpResolver {
208    fn can_resolve(&self, url: &str) -> bool {
209        url.starts_with("http://") || url.starts_with("https://")
210    }
211
212    fn resolve(&self, url: &str) -> Result<ResolvedContent> {
213        #[cfg(feature = "eval")]
214        {
215            let _url = url; // Used in error messages below
216                            // Reuse the download infrastructure from eval/loader
217                            // Note: download_attempt is private, so we'll implement our own
218            let response = ureq::get(url)
219                .timeout(std::time::Duration::from_secs(60))
220                .call()
221                .map_err(|e| {
222                    let error_msg = format!("{}", e);
223                    crate::Error::InvalidInput(format!(
224                        "Network error fetching {}: {}. \
225                         Check your internet connection and try again.",
226                        url, error_msg
227                    ))
228                })?;
229
230            if response.status() != 200 {
231                return Err(crate::Error::InvalidInput(format!(
232                    "HTTP {} fetching {}. \
233                     Server returned error status. \
234                     URL may be temporarily unavailable or changed.",
235                    response.status(),
236                    url
237                )));
238            }
239
240            let content = response.into_string().map_err(|e| {
241                crate::Error::InvalidInput(format!(
242                    "Failed to read response from {}: {}. \
243                     Response may be too large or corrupted.",
244                    url, e
245                ))
246            })?;
247
248            let mut metadata = HashMap::new();
249            metadata.insert("content-type".to_string(), "text/html".to_string());
250            metadata.insert("source".to_string(), "http".to_string());
251
252            // Check if content looks like HTML
253            let text = if content.trim_start().starts_with('<') {
254                // HTML content - extract text
255                metadata.insert("content-type".to_string(), "text/html".to_string());
256                self.extract_text_from_html(&content)
257            } else {
258                // Plain text
259                metadata.insert("content-type".to_string(), "text/plain".to_string());
260                content
261            };
262
263            Ok(ResolvedContent {
264                text,
265                metadata,
266                source_url: url.to_string(),
267            })
268        }
269
270        #[cfg(not(feature = "eval"))]
271        {
272            #[allow(unused_variables)]
273            let _url = url;
274            Err(crate::Error::InvalidInput(
275                "URL resolution requires 'eval' feature. \
276                 Enable it with: cargo build -p anno-cli --features eval"
277                    .to_string(),
278            ))
279        }
280    }
281}
282
/// Composite resolver that tries multiple resolvers in order.
///
/// Resolution is delegated to the first registered resolver whose
/// `can_resolve` accepts the URL.
pub struct CompositeResolver {
    /// Registered resolvers, consulted in the order they were added.
    resolvers: Vec<Box<dyn UrlResolver>>,
}
287
288impl std::fmt::Debug for CompositeResolver {
289    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
290        f.debug_struct("CompositeResolver")
291            .field("resolver_count", &self.resolvers.len())
292            .finish()
293    }
294}
295
296impl CompositeResolver {
297    /// Create a new composite resolver with default resolvers.
298    #[must_use]
299    pub fn new() -> Self {
300        let resolvers = vec![Box::new(HttpResolver::new()) as Box<dyn UrlResolver>];
301        Self { resolvers }
302    }
303
304    /// Add a resolver to the chain.
305    pub fn add_resolver(&mut self, resolver: Box<dyn UrlResolver>) {
306        self.resolvers.push(resolver);
307    }
308}
309
310impl Default for CompositeResolver {
311    fn default() -> Self {
312        Self::new()
313    }
314}
315
316impl UrlResolver for CompositeResolver {
317    fn can_resolve(&self, url: &str) -> bool {
318        self.resolvers.iter().any(|r| r.can_resolve(url))
319    }
320
321    fn resolve(&self, url: &str) -> Result<ResolvedContent> {
322        for resolver in &self.resolvers {
323            if resolver.can_resolve(url) {
324                return resolver.resolve(url);
325            }
326        }
327        Err(crate::Error::InvalidInput(format!(
328            "No resolver available for URL: {}",
329            url
330        )))
331    }
332}
333
// Unit tests for the resolvers. None of them performs a real network request:
// they cover scheme matching, struct plumbing, HTML text extraction and the
// error paths that are reachable offline.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_http_resolver_can_resolve_http() {
        let resolver = HttpResolver::new();
        assert!(resolver.can_resolve("http://example.com"));
        assert!(resolver.can_resolve("https://example.com"));
        assert!(resolver.can_resolve("http://example.com/path?query=1"));
        assert!(resolver.can_resolve("https://subdomain.example.com/path"));
    }

    #[test]
    fn test_http_resolver_case_sensitive() {
        // Note: Implementation is case-sensitive (lowercase only)
        let resolver = HttpResolver::new();
        assert!(!resolver.can_resolve("HTTP://example.com"));
        assert!(!resolver.can_resolve("HTTPS://example.com"));
    }

    #[test]
    fn test_http_resolver_cannot_resolve_other_schemes() {
        let resolver = HttpResolver::new();
        assert!(!resolver.can_resolve("ftp://example.com"));
        assert!(!resolver.can_resolve("file:///path/to/file"));
        assert!(!resolver.can_resolve("mailto:test@example.com"));
        assert!(!resolver.can_resolve("not_a_url"));
    }

    #[test]
    fn test_resolved_content_struct() {
        let content = ResolvedContent {
            text: "Hello world".to_string(),
            metadata: HashMap::new(),
            source_url: "https://example.com".to_string(),
        };

        assert_eq!(content.text, "Hello world");
        assert!(content.metadata.is_empty());
        assert_eq!(content.source_url, "https://example.com");
    }

    #[test]
    fn test_resolved_content_with_metadata() {
        let mut metadata = HashMap::new();
        metadata.insert("content-type".to_string(), "text/html".to_string());

        let content = ResolvedContent {
            text: "Test".to_string(),
            metadata,
            source_url: "https://test.com".to_string(),
        };

        assert_eq!(
            content.metadata.get("content-type"),
            Some(&"text/html".to_string())
        );
    }

    #[test]
    fn test_composite_resolver_creation() {
        let resolver = CompositeResolver::new();
        assert!(resolver.can_resolve("https://example.com"));
    }

    #[test]
    fn test_composite_resolver_default() {
        let resolver = CompositeResolver::default();
        // Should have at least one resolver (HttpResolver)
        assert!(resolver.can_resolve("http://example.com"));
    }

    #[test]
    fn test_composite_resolver_cannot_resolve_unknown() {
        let resolver = CompositeResolver::new();
        assert!(!resolver.can_resolve("custom://unknown"));
    }

    #[test]
    fn test_composite_resolver_debug() {
        // The hand-written Debug impl exposes the resolver count, not the resolvers.
        let resolver = CompositeResolver::new();
        let debug = format!("{:?}", resolver);
        assert!(debug.contains("CompositeResolver"));
        assert!(debug.contains("resolver_count"));
    }

    #[test]
    fn test_http_resolver_debug() {
        let resolver = HttpResolver::new();
        let debug = format!("{:?}", resolver);
        assert!(debug.contains("HttpResolver"));
    }

    #[test]
    fn test_extract_text_from_html_collapses_whitespace() {
        let resolver = HttpResolver::new();
        // The fixture deliberately contains newlines, deep indentation and a literal tab.
        let html = r#"
            <html>
              <head><title>t</title></head>
              <body>
                <h1>Hello
                    world</h1>
                <p>Line1<br>Line2</p>
                <div>Tabbed	text</div>
                <p>習近平在北京會見了普京。</p>
                <p>التقى محمد بن سلمان بالرئيس في الرياض</p>
                <p>Путин встретился с Си Цзиньпином в Москве.</p>
                <p>प्रधान मंत्री शर्मा आज आए।</p>
              </body>
            </html>
        "#;

        let text = resolver.extract_text_from_html(html);
        assert!(text.contains("Hello world"));
        assert!(text.contains("Line1 Line2"));
        assert!(text.contains("Tabbed text"));
        // Multilingual smoke: make sure we don't drop/garble non-Latin scripts.
        assert!(text.contains("習近平在北京會見了普京。"));
        assert!(text.contains("التقى محمد بن سلمان بالرئيس في الرياض"));
        assert!(text.contains("Путин встретился с Си Цзиньпином в Москве."));
        assert!(text.contains("प्रधान मंत्री शर्मा आज आए।"));

        // No raw newlines/tabs from HTML formatting should survive.
        assert!(!text.contains('\n'));
        assert!(!text.contains('\t'));

        // No double spaces (collapsed).
        assert!(!text.contains("  "));
    }

    #[test]
    fn test_resolved_content_clone() {
        let mut metadata = HashMap::new();
        metadata.insert("key".to_string(), "value".to_string());

        let content = ResolvedContent {
            text: "test".to_string(),
            metadata,
            source_url: "http://test.com".to_string(),
        };

        // Compare field by field so the test only relies on the fields' own equality.
        let cloned = content.clone();
        assert_eq!(content.text, cloned.text);
        assert_eq!(content.source_url, cloned.source_url);
        assert_eq!(content.metadata, cloned.metadata);
    }

    #[test]
    #[cfg(not(feature = "eval"))]
    fn test_http_resolver_without_feature() {
        let resolver = HttpResolver::new();
        let result = resolver.resolve("https://example.com");
        // Without eval feature, should return an error
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("eval"));
    }

    #[test]
    fn test_composite_resolver_no_matching_resolver() {
        // An empty chain can only be built from inside the module (private field).
        let resolver = CompositeResolver { resolvers: vec![] };
        let result = resolver.resolve("any://url");
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("No resolver available"));
    }
}
anno/ingest/url_resolver.rs

anno/ingest/
url_resolver.rs