Skip to main content

synaptic_loaders/
web_loader.rs

1use std::collections::HashMap;
2
3use crate::Document;
4use async_trait::async_trait;
5use serde_json::Value;
6use synaptic_core::SynapticError;
7
8use crate::Loader;
9
/// Loads content from a URL via HTTP GET.
///
/// Uses `reqwest` to fetch the URL content and returns a single Document
/// with the URL as id and the response text as content.
/// Metadata includes `source` (the URL) and `content_type` (from the response header).
///
/// The loader only stores the URL, so it is cheap to clone and safe to
/// debug-print.
#[derive(Debug, Clone)]
pub struct WebBaseLoader {
    /// Target URL, stored exactly as passed to [`WebBaseLoader::new`].
    url: String,
}

impl WebBaseLoader {
    /// Creates a loader for `url`.
    ///
    /// Accepts anything convertible into a `String` (`&str`, `String`, ...).
    /// The value is stored as given; nothing is parsed or validated here.
    pub fn new(url: impl Into<String>) -> Self {
        Self { url: url.into() }
    }
}
24
25#[async_trait]
26impl Loader for WebBaseLoader {
27    async fn load(&self) -> Result<Vec<Document>, SynapticError> {
28        let response = reqwest::get(&self.url).await.map_err(|e| {
29            SynapticError::Loader(format!("HTTP request failed for {}: {e}", self.url))
30        })?;
31
32        let content_type = response
33            .headers()
34            .get(reqwest::header::CONTENT_TYPE)
35            .and_then(|v| v.to_str().ok())
36            .unwrap_or("unknown")
37            .to_string();
38
39        let text = response
40            .text()
41            .await
42            .map_err(|e| SynapticError::Loader(format!("failed to read response body: {e}")))?;
43
44        let mut metadata = HashMap::new();
45        metadata.insert("source".to_string(), Value::String(self.url.clone()));
46        metadata.insert("content_type".to_string(), Value::String(content_type));
47
48        Ok(vec![Document::with_metadata(
49            self.url.clone(),
50            text,
51            metadata,
52        )])
53    }
54}