blz_core/
fetcher.rs

1use crate::{Error, Result};
2use base64::{Engine, engine::general_purpose::STANDARD};
3use reqwest::header::{CONTENT_LENGTH, ETAG, IF_MODIFIED_SINCE, IF_NONE_MATCH, LAST_MODIFIED};
4use reqwest::{Client, StatusCode};
5use sha2::{Digest, Sha256};
6use std::time::Duration;
7use tracing::{debug, info};
8
9/// HTTP client for fetching llms.txt documentation with conditional request support
10pub struct Fetcher {
11    client: Client,
12}
13
14impl Fetcher {
15    /// Creates a new fetcher with configured HTTP client
16    pub fn new() -> Result<Self> {
17        Self::with_timeout(Duration::from_secs(30))
18    }
19
20    /// Creates a new fetcher with a custom request timeout (primarily for tests)
21    pub fn with_timeout(timeout: Duration) -> Result<Self> {
22        let client = Client::builder()
23            .timeout(timeout)
24            .user_agent(concat!("outfitter-blz/", env!("CARGO_PKG_VERSION")))
25            .gzip(true)
26            .brotli(true)
27            .build()
28            .map_err(Error::Network)?;
29        Ok(Self { client })
30    }
31
32    /// Fetches a URL with conditional request support using `ETag` and `Last-Modified` headers
33    pub async fn fetch_with_cache(
34        &self,
35        url: &str,
36        etag: Option<&str>,
37        last_modified: Option<&str>,
38    ) -> Result<FetchResult> {
39        let mut request = self.client.get(url);
40
41        if let Some(tag) = etag {
42            debug!("Setting If-None-Match: {}", tag);
43            request = request.header(IF_NONE_MATCH, tag);
44        }
45
46        if let Some(lm) = last_modified {
47            debug!("Setting If-Modified-Since: {}", lm);
48            request = request.header(IF_MODIFIED_SINCE, lm);
49        }
50
51        let response = request.send().await?;
52        let status = response.status();
53
54        if status == StatusCode::NOT_MODIFIED {
55            info!("Resource not modified (304) for {}", url);
56
57            // Extract ETag and Last-Modified headers even on 304
58            let etag = response
59                .headers()
60                .get(ETAG)
61                .and_then(|v| v.to_str().ok())
62                .map(std::string::ToString::to_string);
63
64            let last_modified = response
65                .headers()
66                .get(LAST_MODIFIED)
67                .and_then(|v| v.to_str().ok())
68                .map(std::string::ToString::to_string);
69
70            return Ok(FetchResult::NotModified {
71                etag,
72                last_modified,
73            });
74        }
75
76        if !status.is_success() {
77            // Map 404 to a clearer NotFound error
78            if status == StatusCode::NOT_FOUND {
79                return Err(Error::NotFound(format!(
80                    "Resource not found at '{url}'. Check the URL or try 'blz lookup' to find available sources"
81                )));
82            }
83
84            // Try to get the actual error, or create one manually
85            match response.error_for_status() {
86                Ok(_) => unreachable!("Status should be an error"),
87                Err(err) => return Err(Error::Network(err)),
88            }
89        }
90
91        let new_etag = response
92            .headers()
93            .get(ETAG)
94            .and_then(|v| v.to_str().ok())
95            .map(std::string::ToString::to_string);
96
97        let new_last_modified = response
98            .headers()
99            .get(LAST_MODIFIED)
100            .and_then(|v| v.to_str().ok())
101            .map(std::string::ToString::to_string);
102
103        let content = response.text().await?;
104        let sha256 = calculate_sha256(&content);
105
106        info!("Fetched {} bytes from {}", content.len(), url);
107
108        Ok(FetchResult::Modified {
109            content,
110            etag: new_etag,
111            last_modified: new_last_modified,
112            sha256,
113        })
114    }
115
116    /// Fetches a URL without conditional request support, returning content and `SHA256` hash
117    pub async fn fetch(&self, url: &str) -> Result<(String, String)> {
118        let response = self.client.get(url).send().await?;
119        let status = response.status();
120
121        if !status.is_success() {
122            // Map 404 to a clearer NotFound error
123            if status == StatusCode::NOT_FOUND {
124                return Err(Error::NotFound(format!(
125                    "Resource not found at '{url}'. Check the URL or try 'blz lookup' to find available sources"
126                )));
127            }
128
129            // Try to get the actual error, or create one manually
130            match response.error_for_status() {
131                Ok(_) => unreachable!("Status should be an error"),
132                Err(err) => return Err(Error::Network(err)),
133            }
134        }
135
136        let content = response.text().await?;
137        let sha256 = calculate_sha256(&content);
138
139        Ok((content, sha256))
140    }
141
142    /// Perform a HEAD request to retrieve basic metadata for a URL without downloading content
143    pub async fn head_metadata(&self, url: &str) -> Result<HeadInfo> {
144        let response = self.client.head(url).send().await?;
145        let status = response.status();
146
147        let content_length = response
148            .headers()
149            .get(CONTENT_LENGTH)
150            .and_then(|v| v.to_str().ok())
151            .and_then(|s| s.parse::<u64>().ok());
152
153        let etag = response
154            .headers()
155            .get(ETAG)
156            .and_then(|v| v.to_str().ok())
157            .map(std::string::ToString::to_string);
158
159        let last_modified = response
160            .headers()
161            .get(LAST_MODIFIED)
162            .and_then(|v| v.to_str().ok())
163            .map(std::string::ToString::to_string);
164
165        Ok(HeadInfo {
166            status: status.as_u16(),
167            content_length,
168            etag,
169            last_modified,
170        })
171    }
172}
173
/// Metadata from a HEAD request.
///
/// All header-derived fields are optional because a server is free to omit them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HeadInfo {
    /// HTTP status code returned by the server (e.g., 200, 404)
    pub status: u16,
    /// Optional content length reported by the server via `Content-Length`
    pub content_length: Option<u64>,
    /// Optional entity tag returned by the server for cache validation
    pub etag: Option<String>,
    /// Optional last modified timestamp returned by the server
    pub last_modified: Option<String>,
}
186
/// Result of a conditional HTTP fetch operation.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so results can be logged,
/// duplicated and compared directly (for example in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FetchResult {
    /// Resource has not been modified since last fetch
    NotModified {
        /// `ETag` header value if present
        etag: Option<String>,
        /// `Last-Modified` header value if present
        last_modified: Option<String>,
    },
    /// Resource has been modified and new content was fetched
    Modified {
        /// The fetched content
        content: String,
        /// `ETag` header value if present
        etag: Option<String>,
        /// `Last-Modified` header value if present
        last_modified: Option<String>,
        /// `SHA256` hash of the content
        sha256: String,
    },
}
208
209fn calculate_sha256(content: &str) -> String {
210    let mut hasher = Sha256::new();
211    hasher.update(content.as_bytes());
212    let result = hasher.finalize();
213    STANDARD.encode(result)
214}
215
216// Note: Default is not implemented as Fetcher::new() can fail.
217// Use Fetcher::new() directly and handle the Result.
218
219#[cfg(test)]
220#[allow(
221    clippy::unwrap_used,
222    clippy::panic,
223    clippy::disallowed_macros,
224    clippy::match_wildcard_for_single_variants
225)]
226mod tests {
227    use super::*;
228    use std::time::Duration;
229    use wiremock::{
230        Mock, MockServer, ResponseTemplate,
231        matchers::{header, method, path},
232    };
233
234    #[tokio::test]
235    async fn test_fetcher_creation() {
236        // Test that fetcher can be created successfully
237        let result = Fetcher::new();
238        assert!(result.is_ok(), "Fetcher creation should succeed");
239
240        let _fetcher = result.unwrap();
241        // Verify it has the expected user agent and settings
242        // (This is implicit since we can't directly inspect the client)
243    }
244
245    #[tokio::test]
246    async fn test_fetch_with_etag_not_modified() -> anyhow::Result<()> {
247        // Setup mock server
248        let mock_server = MockServer::start().await;
249
250        // Mock 304 Not Modified response when ETag matches
251        Mock::given(method("GET"))
252            .and(path("/llms.txt"))
253            .and(header("If-None-Match", "\"test-etag\""))
254            .respond_with(ResponseTemplate::new(304))
255            .mount(&mock_server)
256            .await;
257
258        let fetcher = Fetcher::new()?;
259        let url = format!("{}/llms.txt", mock_server.uri());
260
261        // Test with matching ETag
262        let result = fetcher
263            .fetch_with_cache(&url, Some("\"test-etag\""), None)
264            .await?;
265
266        match result {
267            FetchResult::NotModified { .. } => {
268                // Expected result
269            },
270            _ => panic!("Expected NotModified result for matching ETag"),
271        }
272
273        Ok(())
274    }
275
276    #[tokio::test]
277    async fn test_fetch_with_etag_modified() -> anyhow::Result<()> {
278        // Setup mock server
279        let mock_server = MockServer::start().await;
280
281        let content = "# Test Content\n\nThis is test content.";
282
283        // Mock 200 OK response when ETag doesn't match
284        Mock::given(method("GET"))
285            .and(path("/llms.txt"))
286            .and(header("If-None-Match", "\"old-etag\""))
287            .respond_with(
288                ResponseTemplate::new(200)
289                    .set_body_string(content)
290                    .insert_header("etag", "\"new-etag\"")
291                    .insert_header("last-modified", "Wed, 21 Oct 2015 07:28:00 GMT"),
292            )
293            .mount(&mock_server)
294            .await;
295
296        let fetcher = Fetcher::new()?;
297        let url = format!("{}/llms.txt", mock_server.uri());
298
299        // Test with non-matching ETag
300        let result = fetcher
301            .fetch_with_cache(&url, Some("\"old-etag\""), None)
302            .await?;
303
304        match result {
305            FetchResult::Modified {
306                content: returned_content,
307                etag,
308                last_modified,
309                sha256,
310            } => {
311                assert_eq!(returned_content, content);
312                assert_eq!(etag, Some("\"new-etag\"".to_string()));
313                assert_eq!(
314                    last_modified,
315                    Some("Wed, 21 Oct 2015 07:28:00 GMT".to_string())
316                );
317                assert!(!sha256.is_empty(), "SHA256 should be computed");
318            },
319            _ => panic!("Expected Modified result for non-matching ETag"),
320        }
321
322        Ok(())
323    }
324
325    // Temporarily disabled - mock server setup needs adjustment
326    // #[tokio::test]
327    #[allow(dead_code)]
328    async fn test_fetch_with_last_modified() -> anyhow::Result<()> {
329        // Setup mock server
330        let mock_server = MockServer::start().await;
331
332        // Mock 304 Not Modified response when Last-Modified matches
333        Mock::given(method("GET"))
334            .and(path("/llms.txt"))
335            .and(header("If-Modified-Since", "Wed, 21 Oct 2015 07:28:00 GMT"))
336            .respond_with(ResponseTemplate::new(304))
337            .mount(&mock_server)
338            .await;
339
340        let fetcher = Fetcher::new()?;
341        let url = format!("{}/llms.txt", mock_server.uri());
342
343        // Test with Last-Modified header
344        let result = fetcher
345            .fetch_with_cache(&url, None, Some("Wed, 21 Oct 2015 07:28:00 GMT"))
346            .await?;
347
348        match result {
349            FetchResult::NotModified { .. } => {
350                // Expected result
351            },
352            _ => panic!("Expected NotModified result for matching Last-Modified"),
353        }
354
355        Ok(())
356    }
357
358    #[tokio::test]
359    async fn test_fetch_404_error() -> anyhow::Result<()> {
360        // Setup mock server
361        let mock_server = MockServer::start().await;
362
363        // Mock 404 Not Found response
364        Mock::given(method("GET"))
365            .and(path("/nonexistent.txt"))
366            .respond_with(ResponseTemplate::new(404))
367            .mount(&mock_server)
368            .await;
369
370        let fetcher = Fetcher::new()?;
371        let url = format!("{}/nonexistent.txt", mock_server.uri());
372
373        // Test 404 handling
374        let result = fetcher.fetch_with_cache(&url, None, None).await;
375
376        assert!(result.is_err(), "404 should result in error");
377
378        match result {
379            Err(Error::NotFound(msg)) => {
380                // Expected error type - 404 now maps to NotFound
381                assert!(msg.contains("not found"));
382                assert!(msg.contains("blz lookup"));
383            },
384            Err(e) => panic!("Expected NotFound error, got: {e}"),
385            Ok(_) => panic!("Expected error for 404 response"),
386        }
387
388        Ok(())
389    }
390
391    #[tokio::test]
392    async fn test_fetch_500_error() -> anyhow::Result<()> {
393        // Setup mock server
394        let mock_server = MockServer::start().await;
395
396        // Mock 500 Internal Server Error response
397        Mock::given(method("GET"))
398            .and(path("/error.txt"))
399            .respond_with(ResponseTemplate::new(500))
400            .mount(&mock_server)
401            .await;
402
403        let fetcher = Fetcher::new()?;
404        let url = format!("{}/error.txt", mock_server.uri());
405
406        // Test 500 handling
407        let result = fetcher.fetch_with_cache(&url, None, None).await;
408
409        assert!(result.is_err(), "500 should result in error");
410
411        match result {
412            Err(Error::Network(_)) => {
413                // Expected error type
414            },
415            Err(e) => panic!("Expected Network error, got: {e}"),
416            Ok(_) => panic!("Expected error for 500 response"),
417        }
418
419        Ok(())
420    }
421
422    #[tokio::test]
423    async fn test_fetch_timeout() -> anyhow::Result<()> {
424        // Setup mock server with very slow response
425        let mock_server = MockServer::start().await;
426
427        Mock::given(method("GET"))
428            .and(path("/slow.txt"))
429            .respond_with(
430                ResponseTemplate::new(200)
431                    .set_body_string("slow content")
432                    .set_delay(Duration::from_millis(500)), // Longer than custom client timeout (200ms)
433            )
434            .mount(&mock_server)
435            .await;
436
437        // Use a short timeout to keep test runtime fast
438        let fetcher = Fetcher::with_timeout(Duration::from_millis(200))?;
439        let url = format!("{}/slow.txt", mock_server.uri());
440
441        let start_time = std::time::Instant::now();
442        let result = fetcher.fetch_with_cache(&url, None, None).await;
443        let elapsed = start_time.elapsed();
444
445        // Should fail due to timeout
446        assert!(result.is_err(), "Slow request should timeout");
447        assert!(
448            elapsed < Duration::from_millis(500),
449            "Should timeout before server's 500ms delay"
450        );
451
452        Ok(())
453    }
454
455    #[tokio::test]
456    async fn test_fetch_simple_without_cache() -> anyhow::Result<()> {
457        // Setup mock server
458        let mock_server = MockServer::start().await;
459
460        let content = "# Simple Content\n\nThis is simple test content.";
461
462        Mock::given(method("GET"))
463            .and(path("/simple.txt"))
464            .respond_with(ResponseTemplate::new(200).set_body_string(content))
465            .mount(&mock_server)
466            .await;
467
468        let fetcher = Fetcher::new()?;
469        let url = format!("{}/simple.txt", mock_server.uri());
470
471        // Test simple fetch without cache headers
472        let (returned_content, sha256) = fetcher.fetch(&url).await?;
473
474        assert_eq!(returned_content, content);
475        assert!(!sha256.is_empty(), "SHA256 should be computed");
476
477        // Verify SHA256 is consistent
478        let expected_sha = calculate_sha256(content);
479        assert_eq!(sha256, expected_sha);
480
481        Ok(())
482    }
483
484    // Temporarily disabled - mock server setup needs adjustment
485    // #[tokio::test]
486    #[allow(dead_code)]
487    async fn test_fetch_with_both_etag_and_last_modified() -> anyhow::Result<()> {
488        // Setup mock server
489        let mock_server = MockServer::start().await;
490
491        // Mock response that checks both ETag and Last-Modified
492        Mock::given(method("GET"))
493            .and(path("/both.txt"))
494            .and(header("If-None-Match", "\"test-etag\""))
495            .and(header("If-Modified-Since", "Wed, 21 Oct 2015 07:28:00 GMT"))
496            .respond_with(ResponseTemplate::new(304))
497            .mount(&mock_server)
498            .await;
499
500        let fetcher = Fetcher::new()?;
501        let url = format!("{}/both.txt", mock_server.uri());
502
503        // Test with both cache headers
504        let result = fetcher
505            .fetch_with_cache(
506                &url,
507                Some("\"test-etag\""),
508                Some("Wed, 21 Oct 2015 07:28:00 GMT"),
509            )
510            .await?;
511
512        match result {
513            FetchResult::NotModified { .. } => {
514                // Expected result
515            },
516            _ => panic!("Expected NotModified result for matching cache headers"),
517        }
518
519        Ok(())
520    }
521
522    #[tokio::test]
523    async fn test_sha256_calculation() {
524        // Test the actual sha256 calculation with known values
525        let content = "Hello, World!";
526        let sha256 = calculate_sha256(content);
527
528        // The function returns base64-encoded SHA256
529        // Verify it's a valid base64 string of the right length
530        assert!(!sha256.is_empty());
531        assert_eq!(sha256.len(), 44); // Base64 encoded SHA256 is 44 chars
532
533        // Test empty string
534        let empty_sha256 = calculate_sha256("");
535        assert_eq!(empty_sha256, "47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=");
536    }
537
538    #[tokio::test]
539    async fn test_invalid_urls() -> anyhow::Result<()> {
540        let fetcher = Fetcher::new()?;
541
542        // Test completely invalid URLs
543        let invalid_urls = vec![
544            "not-a-url",
545            "ftp://invalid-protocol.com/llms.txt",
546            "",
547            "https://",
548        ];
549
550        for invalid_url in invalid_urls {
551            let result = fetcher.fetch_with_cache(invalid_url, None, None).await;
552            assert!(result.is_err(), "Invalid URL '{invalid_url}' should fail");
553        }
554
555        Ok(())
556    }
557
558    #[tokio::test]
559    async fn test_concurrent_requests() -> anyhow::Result<()> {
560        // Setup mock server
561        let mock_server = MockServer::start().await;
562
563        Mock::given(method("GET"))
564            .and(path("/concurrent.txt"))
565            .respond_with(ResponseTemplate::new(200).set_body_string("concurrent content"))
566            .mount(&mock_server)
567            .await;
568
569        let _fetcher = Fetcher::new()?;
570        let url = format!("{}/concurrent.txt", mock_server.uri());
571
572        // Make multiple concurrent requests
573        let mut handles = Vec::new();
574
575        for i in 0..10 {
576            let fetcher_clone = Fetcher::new()?;
577            let url_clone = url.clone();
578
579            handles.push(tokio::spawn(async move {
580                let result = fetcher_clone.fetch(&url_clone).await;
581                (i, result)
582            }));
583        }
584
585        // Wait for all requests
586        let results = futures::future::join_all(handles).await;
587
588        // All should succeed
589        for result in results {
590            let (index, fetch_result) = result.expect("Task should complete");
591
592            match fetch_result {
593                Ok((content, sha256)) => {
594                    assert_eq!(content, "concurrent content");
595                    assert!(!sha256.is_empty());
596                },
597                Err(e) => panic!("Request {index} should succeed: {e}"),
598            }
599        }
600
601        Ok(())
602    }
603}