fetchkit/fetchers/
wikipedia.rs1use crate::client::FetchOptions;
7use crate::error::FetchError;
8use crate::fetchers::Fetcher;
9use crate::types::{FetchRequest, FetchResponse};
10use crate::DEFAULT_USER_AGENT;
11use async_trait::async_trait;
12use reqwest::header::{HeaderValue, ACCEPT, USER_AGENT};
13use serde::Deserialize;
14use std::time::Duration;
15use url::Url;
16
/// Time budget for talking to the Wikipedia REST API.
///
/// Installed on the HTTP client as both its connect timeout and its overall
/// request timeout.
const API_TIMEOUT: Duration = Duration::from_secs(10);
18
19pub struct WikipediaFetcher;
24
25impl WikipediaFetcher {
26 pub fn new() -> Self {
27 Self
28 }
29
30 fn parse_url(url: &Url) -> Option<(String, String)> {
32 let host = url.host_str()?;
33
34 let lang = host.strip_suffix(".wikipedia.org")?;
36 if lang.is_empty() || lang.contains('.') {
37 return None;
38 }
39
40 let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default();
41
42 if segments.len() < 2 || segments[0] != "wiki" {
44 return None;
45 }
46
47 let title = segments[1..].join("/");
48 if title.is_empty() {
49 return None;
50 }
51
52 Some((lang.to_string(), title))
53 }
54}
55
56impl Default for WikipediaFetcher {
57 fn default() -> Self {
58 Self::new()
59 }
60}
61
62#[derive(Debug, Deserialize)]
63struct WikiSummary {
64 title: String,
65 extract: Option<String>,
66 description: Option<String>,
67 content_urls: Option<ContentUrls>,
68}
69
70#[derive(Debug, Deserialize)]
71struct ContentUrls {
72 desktop: Option<DesktopUrl>,
73}
74
75#[derive(Debug, Deserialize)]
76struct DesktopUrl {
77 page: Option<String>,
78}
79
80#[async_trait]
81impl Fetcher for WikipediaFetcher {
82 fn name(&self) -> &'static str {
83 "wikipedia"
84 }
85
86 fn matches(&self, url: &Url) -> bool {
87 Self::parse_url(url).is_some()
88 }
89
90 async fn fetch(
91 &self,
92 request: &FetchRequest,
93 options: &FetchOptions,
94 ) -> Result<FetchResponse, FetchError> {
95 let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
96
97 let (lang, title) = Self::parse_url(&url)
98 .ok_or_else(|| FetchError::FetcherError("Not a valid Wikipedia URL".to_string()))?;
99
100 let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
101 let mut client_builder = reqwest::Client::builder()
102 .connect_timeout(API_TIMEOUT)
103 .timeout(API_TIMEOUT)
104 .redirect(reqwest::redirect::Policy::limited(3));
105
106 if !options.respect_proxy_env {
107 client_builder = client_builder.no_proxy();
108 }
109
110 let client = client_builder
111 .build()
112 .map_err(FetchError::ClientBuildError)?;
113
114 let ua_header = HeaderValue::from_str(user_agent)
115 .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
116
117 let summary_url = format!(
119 "https://{}.wikipedia.org/api/rest_v1/page/summary/{}",
120 lang, title
121 );
122
123 let summary_resp = client
124 .get(&summary_url)
125 .header(USER_AGENT, ua_header.clone())
126 .header(ACCEPT, HeaderValue::from_static("application/json"))
127 .send()
128 .await
129 .map_err(FetchError::from_reqwest)?;
130
131 let status_code = summary_resp.status().as_u16();
132 if !summary_resp.status().is_success() {
133 let error_msg = if status_code == 404 {
134 format!("Article '{}' not found on {}.wikipedia.org", title, lang)
135 } else {
136 format!("Wikipedia API error: HTTP {}", status_code)
137 };
138 return Ok(FetchResponse {
139 url: request.url.clone(),
140 status_code,
141 error: Some(error_msg),
142 ..Default::default()
143 });
144 }
145
146 let summary: WikiSummary = summary_resp.json().await.map_err(|e| {
147 FetchError::FetcherError(format!("Failed to parse Wikipedia data: {}", e))
148 })?;
149
150 let html_url = format!(
152 "https://{}.wikipedia.org/api/rest_v1/page/html/{}",
153 lang, title
154 );
155
156 let full_content = match client
157 .get(&html_url)
158 .header(USER_AGENT, ua_header)
159 .send()
160 .await
161 {
162 Ok(resp) if resp.status().is_success() => {
163 let html = resp.text().await.ok();
164 html.map(|h| crate::convert::html_to_markdown(&h))
165 }
166 _ => None,
167 };
168
169 let content = format_wikipedia_response(&summary, full_content.as_deref(), &lang);
170
171 Ok(FetchResponse {
172 url: request.url.clone(),
173 status_code: 200,
174 content_type: Some("text/markdown".to_string()),
175 format: Some("wikipedia".to_string()),
176 content: Some(content),
177 ..Default::default()
178 })
179 }
180}
181
182fn format_wikipedia_response(
183 summary: &WikiSummary,
184 full_content: Option<&str>,
185 lang: &str,
186) -> String {
187 let mut out = String::new();
188
189 out.push_str(&format!("# {}\n\n", summary.title));
190
191 if let Some(desc) = &summary.description {
192 out.push_str(&format!("*{}*\n\n", desc));
193 }
194
195 out.push_str(&format!("- **Language:** {}\n", lang));
196
197 if let Some(urls) = &summary.content_urls {
198 if let Some(desktop) = &urls.desktop {
199 if let Some(page) = &desktop.page {
200 out.push_str(&format!("- **URL:** {}\n", page));
201 }
202 }
203 }
204
205 if let Some(content) = full_content {
207 out.push_str(&format!("\n---\n\n{}", content));
208 } else if let Some(extract) = &summary.extract {
209 out.push_str(&format!("\n## Summary\n\n{}\n", extract));
210 }
211
212 out
213}
214
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `raw` and runs it through `WikipediaFetcher::parse_url`.
    fn parse(raw: &str) -> Option<(String, String)> {
        let url = Url::parse(raw).expect("test URL must be valid");
        WikipediaFetcher::parse_url(&url)
    }

    /// Summary fixture shared by the formatting tests.
    fn sample_summary() -> WikiSummary {
        WikiSummary {
            title: "Rust (programming language)".to_string(),
            extract: Some("Rust is a systems programming language.".to_string()),
            description: Some("Programming language".to_string()),
            content_urls: None,
        }
    }

    #[test]
    fn test_parse_wikipedia_url() {
        assert_eq!(
            parse("https://en.wikipedia.org/wiki/Rust_(programming_language)"),
            Some(("en".to_string(), "Rust_(programming_language)".to_string()))
        );
    }

    #[test]
    fn test_parse_other_language() {
        assert_eq!(
            parse("https://de.wikipedia.org/wiki/Berlin"),
            Some(("de".to_string(), "Berlin".to_string()))
        );
    }

    #[test]
    fn test_parse_subpage_title() {
        // Everything after `/wiki/` belongs to the title, slashes included.
        assert_eq!(
            parse("https://en.wikipedia.org/wiki/AC/DC"),
            Some(("en".to_string(), "AC/DC".to_string()))
        );
    }

    #[test]
    fn test_rejects_non_wiki_path() {
        assert_eq!(
            parse("https://en.wikipedia.org/w/index.php?title=Rust"),
            None
        );
    }

    #[test]
    fn test_rejects_non_wikipedia() {
        assert_eq!(parse("https://example.org/wiki/Test"), None);
    }

    #[test]
    fn test_rejects_multi_label_subdomain() {
        // Only a single language label is accepted in front of the domain.
        assert_eq!(parse("https://en.m.wikipedia.org/wiki/Rust"), None);
    }

    #[test]
    fn test_rejects_bare_domain() {
        assert_eq!(parse("https://wikipedia.org/wiki/Rust"), None);
    }

    #[test]
    fn test_rejects_missing_title() {
        assert_eq!(parse("https://en.wikipedia.org/wiki"), None);
        assert_eq!(parse("https://en.wikipedia.org/wiki/"), None);
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = WikipediaFetcher::new();

        let url = Url::parse("https://en.wikipedia.org/wiki/Rust").unwrap();
        assert!(fetcher.matches(&url));

        let url = Url::parse("https://example.com/wiki/Rust").unwrap();
        assert!(!fetcher.matches(&url));
    }

    #[test]
    fn test_format_wikipedia_response() {
        let output = format_wikipedia_response(&sample_summary(), None, "en");

        assert!(output.contains("# Rust (programming language)"));
        assert!(output.contains("*Programming language*"));
        assert!(output.contains("Rust is a systems programming language."));
    }

    #[test]
    fn test_format_prefers_full_content() {
        let output = format_wikipedia_response(&sample_summary(), Some("Article body"), "en");

        assert!(output.contains("\n---\n\nArticle body"));
        assert!(!output.contains("## Summary"));
        assert!(output.contains("- **Language:** en\n"));
    }

    #[test]
    fn test_format_includes_page_url() {
        let summary = WikiSummary {
            content_urls: Some(ContentUrls {
                desktop: Some(DesktopUrl {
                    page: Some("https://en.wikipedia.org/wiki/Rust".to_string()),
                }),
            }),
            ..sample_summary()
        };

        let output = format_wikipedia_response(&summary, None, "en");

        assert!(output.contains("- **URL:** https://en.wikipedia.org/wiki/Rust\n"));
    }
}
275}