Skip to main content

fetchkit/fetchers/
wikipedia.rs

1//! Wikipedia article fetcher
2//!
3//! Handles wikipedia.org/wiki/{title} URLs, returning clean article content
4//! via the MediaWiki REST API.
5
6use crate::client::FetchOptions;
7use crate::error::FetchError;
8use crate::fetchers::Fetcher;
9use crate::types::{FetchRequest, FetchResponse};
10use crate::DEFAULT_USER_AGENT;
11use async_trait::async_trait;
12use reqwest::header::{HeaderValue, ACCEPT, USER_AGENT};
13use serde::Deserialize;
14use std::time::Duration;
15use url::Url;
16
/// Time budget applied to Wikipedia API requests.
///
/// `fetch` passes this value as both the connect timeout and the overall
/// request timeout of the HTTP client it builds.
const API_TIMEOUT: Duration = Duration::from_secs(10);
18
/// Fetcher for Wikipedia articles.
///
/// Recognises URLs of the form `https://{lang}.wikipedia.org/wiki/{title}`
/// and produces the article summary and content obtained through the
/// MediaWiki REST API. The type is a stateless unit struct.
pub struct WikipediaFetcher;
24
25impl WikipediaFetcher {
26    pub fn new() -> Self {
27        Self
28    }
29
30    /// Extract language and title from a Wikipedia URL
31    fn parse_url(url: &Url) -> Option<(String, String)> {
32        let host = url.host_str()?;
33
34        // Must be {lang}.wikipedia.org
35        let lang = host.strip_suffix(".wikipedia.org")?;
36        if lang.is_empty() || lang.contains('.') {
37            return None;
38        }
39
40        let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default();
41
42        // Must be /wiki/{title}
43        if segments.len() < 2 || segments[0] != "wiki" {
44            return None;
45        }
46
47        let title = segments[1..].join("/");
48        if title.is_empty() {
49            return None;
50        }
51
52        Some((lang.to_string(), title))
53    }
54}
55
56impl Default for WikipediaFetcher {
57    fn default() -> Self {
58        Self::new()
59    }
60}
61
62#[derive(Debug, Deserialize)]
63struct WikiSummary {
64    title: String,
65    extract: Option<String>,
66    description: Option<String>,
67    content_urls: Option<ContentUrls>,
68}
69
70#[derive(Debug, Deserialize)]
71struct ContentUrls {
72    desktop: Option<DesktopUrl>,
73}
74
75#[derive(Debug, Deserialize)]
76struct DesktopUrl {
77    page: Option<String>,
78}
79
80#[async_trait]
81impl Fetcher for WikipediaFetcher {
82    fn name(&self) -> &'static str {
83        "wikipedia"
84    }
85
86    fn matches(&self, url: &Url) -> bool {
87        Self::parse_url(url).is_some()
88    }
89
90    async fn fetch(
91        &self,
92        request: &FetchRequest,
93        options: &FetchOptions,
94    ) -> Result<FetchResponse, FetchError> {
95        let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
96
97        let (lang, title) = Self::parse_url(&url)
98            .ok_or_else(|| FetchError::FetcherError("Not a valid Wikipedia URL".to_string()))?;
99
100        let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
101        let mut client_builder = reqwest::Client::builder()
102            .connect_timeout(API_TIMEOUT)
103            .timeout(API_TIMEOUT)
104            .redirect(reqwest::redirect::Policy::limited(3));
105
106        if !options.respect_proxy_env {
107            client_builder = client_builder.no_proxy();
108        }
109
110        let client = client_builder
111            .build()
112            .map_err(FetchError::ClientBuildError)?;
113
114        let ua_header = HeaderValue::from_str(user_agent)
115            .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
116
117        // Fetch summary via REST API
118        let summary_url = format!(
119            "https://{}.wikipedia.org/api/rest_v1/page/summary/{}",
120            lang, title
121        );
122
123        let summary_resp = client
124            .get(&summary_url)
125            .header(USER_AGENT, ua_header.clone())
126            .header(ACCEPT, HeaderValue::from_static("application/json"))
127            .send()
128            .await
129            .map_err(FetchError::from_reqwest)?;
130
131        let status_code = summary_resp.status().as_u16();
132        if !summary_resp.status().is_success() {
133            let error_msg = if status_code == 404 {
134                format!("Article '{}' not found on {}.wikipedia.org", title, lang)
135            } else {
136                format!("Wikipedia API error: HTTP {}", status_code)
137            };
138            return Ok(FetchResponse {
139                url: request.url.clone(),
140                status_code,
141                error: Some(error_msg),
142                ..Default::default()
143            });
144        }
145
146        let summary: WikiSummary = summary_resp.json().await.map_err(|e| {
147            FetchError::FetcherError(format!("Failed to parse Wikipedia data: {}", e))
148        })?;
149
150        // Also fetch full HTML content and convert to markdown
151        let html_url = format!(
152            "https://{}.wikipedia.org/api/rest_v1/page/html/{}",
153            lang, title
154        );
155
156        let full_content = match client
157            .get(&html_url)
158            .header(USER_AGENT, ua_header)
159            .send()
160            .await
161        {
162            Ok(resp) if resp.status().is_success() => {
163                let html = resp.text().await.ok();
164                html.map(|h| crate::convert::html_to_markdown(&h))
165            }
166            _ => None,
167        };
168
169        let content = format_wikipedia_response(&summary, full_content.as_deref(), &lang);
170
171        Ok(FetchResponse {
172            url: request.url.clone(),
173            status_code: 200,
174            content_type: Some("text/markdown".to_string()),
175            format: Some("wikipedia".to_string()),
176            content: Some(content),
177            ..Default::default()
178        })
179    }
180}
181
182fn format_wikipedia_response(
183    summary: &WikiSummary,
184    full_content: Option<&str>,
185    lang: &str,
186) -> String {
187    let mut out = String::new();
188
189    out.push_str(&format!("# {}\n\n", summary.title));
190
191    if let Some(desc) = &summary.description {
192        out.push_str(&format!("*{}*\n\n", desc));
193    }
194
195    out.push_str(&format!("- **Language:** {}\n", lang));
196
197    if let Some(urls) = &summary.content_urls {
198        if let Some(desktop) = &urls.desktop {
199            if let Some(page) = &desktop.page {
200                out.push_str(&format!("- **URL:** {}\n", page));
201            }
202        }
203    }
204
205    // Use full content if available, otherwise use summary extract
206    if let Some(content) = full_content {
207        out.push_str(&format!("\n---\n\n{}", content));
208    } else if let Some(extract) = &summary.extract {
209        out.push_str(&format!("\n## Summary\n\n{}\n", extract));
210    }
211
212    out
213}
214
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `raw` as a URL and runs it through `WikipediaFetcher::parse_url`.
    fn parse(raw: &str) -> Option<(String, String)> {
        let url = Url::parse(raw).unwrap();
        WikipediaFetcher::parse_url(&url)
    }

    /// Builds the owned `(lang, title)` pair expected from a successful parse.
    fn pair(lang: &str, title: &str) -> Option<(String, String)> {
        Some((lang.to_string(), title.to_string()))
    }

    #[test]
    fn test_parse_wikipedia_url() {
        assert_eq!(
            parse("https://en.wikipedia.org/wiki/Rust_(programming_language)"),
            pair("en", "Rust_(programming_language)")
        );
    }

    #[test]
    fn test_parse_other_language() {
        assert_eq!(
            parse("https://de.wikipedia.org/wiki/Berlin"),
            pair("de", "Berlin")
        );
    }

    #[test]
    fn test_rejects_non_wiki_path() {
        assert_eq!(
            parse("https://en.wikipedia.org/w/index.php?title=Rust"),
            None
        );
    }

    #[test]
    fn test_rejects_non_wikipedia() {
        assert_eq!(parse("https://example.org/wiki/Test"), None);
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = WikipediaFetcher::new();

        let wikipedia = Url::parse("https://en.wikipedia.org/wiki/Rust").unwrap();
        assert!(fetcher.matches(&wikipedia));

        let other_site = Url::parse("https://example.com/wiki/Rust").unwrap();
        assert!(!fetcher.matches(&other_site));
    }

    #[test]
    fn test_format_wikipedia_response() {
        let summary = WikiSummary {
            title: "Rust (programming language)".to_string(),
            extract: Some("Rust is a systems programming language.".to_string()),
            description: Some("Programming language".to_string()),
            content_urls: None,
        };

        let rendered = format_wikipedia_response(&summary, None, "en");

        let expected_fragments = [
            "# Rust (programming language)",
            "*Programming language*",
            "Rust is a systems programming language.",
        ];
        for fragment in expected_fragments {
            assert!(rendered.contains(fragment));
        }
    }
}