1use crate::{
2 document::{Document, HeaderKind},
3 parser::{Parser, WikipediaParser},
4 Endpoint,
5};
6use anyhow::{anyhow, Context, Result};
7use reqwest::{Client, Response};
8use scraper::Html;
9use serde::{de, Deserialize, Deserializer};
10use std::fmt::Display;
11use std::str::FromStr;
12use tracing::{debug, warn};
13use url::Url;
14
15use super::languages::Language;
16
17pub mod link_data {
18 use crate::{languages::Language, search::Namespace, Endpoint};
19 use url::Url;
20
21 #[derive(Debug, Clone, PartialEq, Eq)]
22 pub struct InternalData {
23 pub namespace: Namespace,
24 pub page: String,
25 pub title: String,
26 pub endpoint: Endpoint,
27 pub language: Language,
28 pub anchor: Option<AnchorData>,
29 }
30
31 #[derive(Debug, Clone, PartialEq, Eq)]
32 pub struct AnchorData {
33 pub anchor: String,
34 pub title: String,
35 }
36
37 #[derive(Debug, Clone, PartialEq, Eq)]
38 pub struct RedLinkData {
39 pub url: Url,
40 pub title: String,
41 }
42
43 #[derive(Debug, Clone, PartialEq, Eq)]
44 pub struct MediaData {
45 pub url: Url,
46 pub title: String,
47 }
48
49 #[derive(Debug, Clone, PartialEq, Eq)]
50 pub struct ExternalData {
51 pub url: Url,
52 }
53
54 #[derive(Debug, Clone, PartialEq, Eq)]
55 pub struct ExternalToInteralData {}
56}
57
58#[derive(Debug, Clone, PartialEq, Eq)]
59pub enum Link {
60 Internal(link_data::InternalData),
62 Anchor(link_data::AnchorData),
66 RedLink(link_data::RedLinkData),
68 MediaLink(link_data::MediaData),
70 External(link_data::ExternalData),
72 ExternalToInternal(link_data::ExternalToInteralData),
74}
75
76impl Link {
77 pub fn title(&self) -> Option<&str> {
78 match self {
79 Link::Anchor(link_data) => Some(&link_data.title),
80 Link::RedLink(link_data) => Some(&link_data.title),
81 &Link::External(_) => None,
82 &Link::ExternalToInternal(_) => None,
83 Link::MediaLink(link_data) => Some(&link_data.title),
84 Link::Internal(link_data) => Some(&link_data.title),
85 }
86 }
87}
88
89#[derive(Debug, Clone, PartialEq, Eq)]
91pub struct LanguageLink {
92 pub name: String,
93 pub language: Language,
94 pub autonym: String,
95 pub title: String,
96 pub url: Url,
97 pub endpoint: Endpoint,
98}
99
100#[derive(Deserialize)]
101struct LanguageLinkInt {
102 #[serde(rename = "langname")]
103 name: String,
104 #[serde(rename = "lang")]
105 #[serde(deserialize_with = "language_from_str")]
106 language: Language,
107 autonym: String,
108 title: String,
109 url: Url,
110}
111
112fn language_from_str<'de, T, D>(deserializer: D) -> Result<T, D::Error>
113where
114 T: FromStr,
115 T::Err: Display,
116 D: Deserializer<'de>,
117{
118 String::deserialize(deserializer)?
119 .parse()
120 .map_err(de::Error::custom)
121}
122
123#[derive(Debug, Deserialize, Clone, PartialEq, Eq)]
124pub struct Section {
125 #[serde(skip_deserializing)]
126 pub index: usize,
127 #[serde(rename = "toclevel")]
128 pub header_kind: HeaderKind,
129 #[serde(rename = "line")]
130 pub text: String,
131 pub number: String,
132 pub anchor: String,
133}
134
135#[derive(Clone, PartialEq, Eq)]
136pub struct Page {
137 pub title: String,
138 pub pageid: usize,
139 pub content: Document,
140 pub language: Language,
141 pub language_links: Option<Vec<LanguageLink>>,
142 pub sections: Option<Vec<Section>>,
143 pub revision_id: Option<usize>,
144}
145
146impl Page {
147 #[cfg(debug_assertions)]
148 pub fn from_path(path: &std::path::PathBuf) -> Option<Page> {
149 if !path.exists() {
150 return None;
151 }
152
153 let content = std::fs::read_to_string(path).ok()?;
154 let nodes = WikipediaParser::parse_document(
155 &content,
156 url::Url::parse("https://en.wikipedia.org/w/api.php").ok()?,
157 Language::default(),
158 )
159 .nodes();
160
161 Some(Page {
162 title: "DEBUG: FILE".to_string(),
163 pageid: 0,
164 content: Document { nodes },
165 language: Language::default(),
166 language_links: None,
167 sections: None,
168 revision_id: None,
169 })
170 }
171
172 pub fn builder() -> PageBuilder<NoPageID, NoPage, NoEndpoint, NoLanguage> {
173 PageBuilder::default()
174 }
175
176 pub fn available_languages(&self) -> Option<usize> {
177 if let Some(ref links) = self.language_links {
178 return Some(links.len());
179 }
180 None
181 }
182
183 pub fn sections(&self) -> Option<&Vec<Section>> {
184 if let Some(ref sections) = self.sections {
185 return Some(sections);
186 }
187 None
188 }
189}
190
191impl std::fmt::Debug for Page {
192 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
193 f.debug_struct("Page")
194 .field("title", &self.title)
195 .field("pageid", &self.pageid)
196 .field("content", &self.content)
197 .field("language", &self.language)
198 .field("language_links", &self.language_links.is_some())
199 .field("sections", &self.sections.is_some())
200 .field("revision_id", &self.revision_id)
201 .finish()
202 }
203}
204
/// Pieces of information that can be requested for a page.
///
/// The [`Display`] implementation yields the name used for each value in the
/// request's `prop` parameter.
#[derive(Clone)]
pub enum Property {
    /// The parsed text of the page (`text`).
    Text,
    /// The language links of the page (`langlinks`).
    LangLinks,
    /// The categories of the page (`categories`).
    Categories,
    /// The HTML version of the categories (`categorieshtml`).
    CategoriesHTML,
    /// The templates of the page (`templates`).
    Templates,
    /// The images of the page (`images`).
    Images,
    /// The external links of the page (`externallinks`).
    ExternalLinks,
    /// The sections of the page (`sections`).
    Sections,
    /// The revision id of the page (`revid`).
    RevID,
    /// The display title of the page (`displaytitle`).
    DisplayTitle,
    /// The subtitle of the page (`subtitle`).
    Subtitle,
    /// The head HTML of the page (`headhtml`).
    HeadHTML,
    /// The page status indicators (`indicators`).
    Indicators,
    /// The interwiki links of the page (`iwlinks`).
    InterwikiLinks,
    /// The wikitext of the page (`wikitext`).
    Wikitext,
    /// The properties of the page (`properties`).
    Properties,
    /// The limit report as structured data (`limitreportdata`).
    LimitReportData,
    /// The limit report as HTML (`limitreporthtml`).
    LimitReportHTML,
    /// The parse tree of the page (`parsetree`).
    ParseTree,
    /// The parse warnings (`parsewarnings`).
    ParseWarnings,
    /// The parse warnings as HTML (`parsewarningshtml`).
    ParseWarningsHTML,
}

impl Display for Property {
    /// Writes the request name of the property.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::Text => "text",
            Self::LangLinks => "langlinks",
            Self::Categories => "categories",
            Self::CategoriesHTML => "categorieshtml",
            Self::Templates => "templates",
            Self::Images => "images",
            Self::ExternalLinks => "externallinks",
            Self::Sections => "sections",
            Self::RevID => "revid",
            Self::DisplayTitle => "displaytitle",
            Self::Subtitle => "subtitle",
            Self::HeadHTML => "headhtml",
            Self::Indicators => "indicators",
            Self::InterwikiLinks => "iwlinks",
            Self::Wikitext => "wikitext",
            Self::Properties => "properties",
            Self::LimitReportData => "limitreportdata",
            Self::LimitReportHTML => "limitreporthtml",
            Self::ParseTree => "parsetree",
            Self::ParseWarnings => "parsewarnings",
            Self::ParseWarningsHTML => "parsewarningshtml",
        };
        f.write_str(name)
    }
}
279
280pub struct WithPageID(usize);
281#[derive(Default)]
282pub struct NoPageID;
283
284pub struct WithPage(String);
285#[derive(Default)]
286pub struct NoPage;
287
288pub struct WithEndpoint(Url);
289#[derive(Default)]
290pub struct NoEndpoint;
291
292pub struct WithLanguage(Language);
293#[derive(Default)]
294pub struct NoLanguage;
295
296#[derive(Default)]
297pub struct PageBuilder<I, P, E, L> {
298 pageid: I,
299 page: P,
300 endpoint: E,
301 language: L,
302 revision: Option<usize>,
303 redirects: Option<bool>,
304 properties: Option<Vec<Property>>,
305}
306
307pub type PageRequest = PageBuilder<NoPageID, WithPage, WithEndpoint, WithLanguage>;
308pub type PageRequestID = PageBuilder<WithPageID, NoPage, WithEndpoint, WithLanguage>;
309
310impl<E, L> PageBuilder<NoPageID, NoPage, E, L> {
311 pub fn pageid(self, pageid: usize) -> PageBuilder<WithPageID, NoPage, E, L> {
313 PageBuilder {
314 pageid: WithPageID(pageid),
315 page: self.page,
316 endpoint: self.endpoint,
317 revision: self.revision,
318 redirects: self.redirects,
319 properties: self.properties,
320 language: self.language,
321 }
322 }
323
324 pub fn page(self, page: impl Into<String>) -> PageBuilder<NoPageID, WithPage, E, L> {
326 PageBuilder {
327 pageid: self.pageid,
328 page: WithPage(page.into()),
329 endpoint: self.endpoint,
330 revision: self.revision,
331 redirects: self.redirects,
332 properties: self.properties,
333 language: self.language,
334 }
335 }
336}
337
338impl<I, P, L> PageBuilder<I, P, NoEndpoint, L> {
339 pub fn url(self, url: impl Into<Url>) -> PageBuilder<I, P, WithEndpoint, L> {
340 PageBuilder {
341 pageid: self.pageid,
342 page: self.page,
343 endpoint: WithEndpoint(url.into()),
344 revision: self.revision,
345 redirects: self.redirects,
346 properties: self.properties,
347 language: self.language,
348 }
349 }
350
351 pub fn endpoint(self, endpoint: Url) -> PageBuilder<I, P, WithEndpoint, L> {
352 PageBuilder {
353 pageid: self.pageid,
354 page: self.page,
355 endpoint: WithEndpoint(endpoint),
356 revision: self.revision,
357 redirects: self.redirects,
358 properties: self.properties,
359 language: self.language,
360 }
361 }
362}
363
364impl<I, P, E> PageBuilder<I, P, E, NoLanguage> {
365 pub fn language(self, language: Language) -> PageBuilder<I, P, E, WithLanguage> {
366 PageBuilder {
367 pageid: self.pageid,
368 page: self.page,
369 endpoint: self.endpoint,
370 language: WithLanguage(language),
371 revision: self.revision,
372 redirects: self.redirects,
373 properties: self.properties,
374 }
375 }
376}
377
378impl<I, P, U, L> PageBuilder<I, P, U, L> {
379 pub fn revision(mut self, revision: usize) -> Self {
381 self.revision = Some(revision);
382 self
383 }
384
385 pub fn redirects(mut self, redirects: bool) -> Self {
387 self.redirects = Some(redirects);
388 self
389 }
390
391 pub fn properties(mut self, properties: Vec<Property>) -> Self {
393 self.properties = Some(properties);
394 self
395 }
396}
397
398impl<I, P> PageBuilder<I, P, WithEndpoint, WithLanguage> {
399 async fn fetch_with_params(self, mut params: Vec<(&str, String)>) -> Result<Page> {
400 async fn action_parse(params: Vec<(&str, String)>, endpoint: Url) -> Result<Response> {
401 Client::new()
402 .get(endpoint)
403 .query(&[
404 ("action", "parse"),
405 ("format", "json"),
406 ("formatversion", "2"),
407 ("parsoid", "true"),
408 ])
409 .query(¶ms)
410 .send()
411 .await
412 .map(|response| {
413 debug!("response url: '{}'", response.url().as_str());
414 response
415 })
416 .context("failed sending the request")
417 }
418
419 if let Some(revision) = self.revision {
420 params.push(("revid", revision.to_string()));
421 }
422
423 if let Some(redirects) = self.redirects {
424 params.push(("redirects", redirects.to_string()));
425 }
426
427 if let Some(ref prop) = self.properties {
428 let mut prop_str = String::new();
429 for prop in prop {
430 prop_str.push('|');
431 prop_str.push_str(&prop.to_string())
432 }
433 params.push(("prop", prop_str));
434 }
435
436 let response = action_parse(params, self.endpoint.0.clone())
437 .await?
438 .error_for_status()
439 .context("the server returned an error")?;
440
441 let res_json: serde_json::Value = serde_json::from_str(
442 &response
443 .text()
444 .await
445 .context("failed reading the response")?,
446 )
447 .context("failed interpreting the response as json")?;
448
449 self.serialize_result(res_json)
450 .context("failed serializing the returned response")
451 }
452
453 fn serialize_result(self, res_json: serde_json::Value) -> Result<Page> {
454 let title = res_json
455 .get("parse")
456 .and_then(|x| x.get("title"))
457 .and_then(|x| x.as_str())
458 .map(|x| x.to_string())
459 .ok_or_else(|| anyhow!("missing the title"))?;
460
461 let pageid = res_json
462 .get("parse")
463 .and_then(|x| x.get("pageid"))
464 .and_then(|x| x.as_u64())
465 .map(|x| x as usize)
466 .ok_or_else(|| anyhow!("missing the pageid"))?;
467
468 let endpoint = self.endpoint.0;
469 let language = self.language.0;
470 let content = res_json
471 .get("parse")
472 .and_then(|x| x.get("text"))
473 .and_then(|x| x.as_str())
474 .map(|x| {
475 let parser = WikipediaParser::parse_document(x, endpoint.clone(), language);
476 Document {
477 nodes: parser.nodes(),
478 }
479 })
480 .ok_or(anyhow!("missing the content or failed parsing the content"))?;
482
483 let language_links = res_json
484 .get("parse")
485 .and_then(|x| x.get("langlinks"))
486 .and_then(|x| x.as_array())
487 .map(|x| x.to_owned())
488 .map(|x| {
489 x.into_iter()
490 .filter_map(|x| {
491 let language_int: LanguageLinkInt = serde_json::from_value(x)
492 .map_err(|err| warn!("language_link parsing error: {:?}", err))
493 .ok()?;
494 let mut endpoint = endpoint.clone();
495 let _ = endpoint.set_host(Some(language_int.url.host_str().unwrap()));
496 Some(LanguageLink {
497 name: language_int.name,
498 language: language_int.language,
499 autonym: language_int.autonym,
500 title: language_int.title,
501 url: language_int.url,
502 endpoint,
503 })
504 })
505 .collect::<Vec<LanguageLink>>()
506 })
507 .map(|x| {
508 debug!("language_links: '{}'", x.len());
509 x
510 });
511
512 let sections = res_json
513 .get("parse")
514 .and_then(|x| x.get("sections"))
515 .and_then(|x| x.as_array())
516 .map(|x| x.to_owned())
517 .map(|x| {
518 x.into_iter()
519 .enumerate()
520 .filter_map(|(i, x)| {
521 serde_json::from_value(x).ok().map(|mut x: Section| {
522 x.index = i + 1;
523 let fragment = Html::parse_document(&x.text);
525 x.text = fragment.root_element().text().collect();
526 x
527 })
528 })
529 .collect::<Vec<Section>>()
530 })
531 .map(|mut x| {
532 x.insert(
533 0,
534 Section {
535 index: 0,
536 header_kind: HeaderKind::Main,
537 text: "(Top)".to_string(),
538 number: "".to_string(),
539 anchor: "Content_Top".to_string(),
540 },
541 );
542 x
543 });
544
545 let revision_id = res_json
546 .get("parse")
547 .and_then(|x| x.get("revid"))
548 .and_then(|x| x.as_u64())
549 .map(|x| x as usize);
550
551 Ok(Page {
552 title,
553 pageid,
554 content,
555 language,
556 language_links,
557 sections,
558 revision_id,
559 })
560 }
561}
562
563impl PageBuilder<WithPageID, NoPage, WithEndpoint, WithLanguage> {
564 pub async fn fetch(self) -> Result<Page> {
565 let param = vec![("pageid", self.pageid.0.to_string())];
566 self.fetch_with_params(param).await
567 }
568}
569
570impl PageBuilder<NoPageID, WithPage, WithEndpoint, WithLanguage> {
571 pub async fn fetch(self) -> Result<Page> {
572 let param = vec![("page", self.page.0.to_string())];
573 self.fetch_with_params(param).await
574 }
575}