wiki_api/
page.rs

1use crate::{
2    document::{Document, HeaderKind},
3    parser::{Parser, WikipediaParser},
4    Endpoint,
5};
6use anyhow::{anyhow, Context, Result};
7use reqwest::{Client, Response};
8use scraper::Html;
9use serde::{de, Deserialize, Deserializer};
10use std::fmt::Display;
11use std::str::FromStr;
12use tracing::{debug, warn};
13use url::Url;
14
15use super::languages::Language;
16
17pub mod link_data {
18    use crate::{languages::Language, search::Namespace, Endpoint};
19    use url::Url;
20
21    #[derive(Debug, Clone, PartialEq, Eq)]
22    pub struct InternalData {
23        pub namespace: Namespace,
24        pub page: String,
25        pub title: String,
26        pub endpoint: Endpoint,
27        pub language: Language,
28        pub anchor: Option<AnchorData>,
29    }
30
31    #[derive(Debug, Clone, PartialEq, Eq)]
32    pub struct AnchorData {
33        pub anchor: String,
34        pub title: String,
35    }
36
37    #[derive(Debug, Clone, PartialEq, Eq)]
38    pub struct RedLinkData {
39        pub url: Url,
40        pub title: String,
41    }
42
43    #[derive(Debug, Clone, PartialEq, Eq)]
44    pub struct MediaData {
45        pub url: Url,
46        pub title: String,
47    }
48
49    #[derive(Debug, Clone, PartialEq, Eq)]
50    pub struct ExternalData {
51        pub url: Url,
52    }
53
54    #[derive(Debug, Clone, PartialEq, Eq)]
55    pub struct ExternalToInteralData {}
56}
57
58#[derive(Debug, Clone, PartialEq, Eq)]
59pub enum Link {
60    /// Interal link to another page in the same wiki
61    Internal(link_data::InternalData),
62    /// Anchor to a specific section in the current page
63    /// Note: this only corresponds to anchors on the current page. For anchors in another page on
64    /// the same wiki, `LinkType::Internal` is used
65    Anchor(link_data::AnchorData),
66    /// A special type of link that leads to an internal page that doesn't exist yet
67    RedLink(link_data::RedLinkData),
68    /// Link pointing to a media
69    MediaLink(link_data::MediaData),
70    /// External link to a page at another website
71    External(link_data::ExternalData),
72    /// External link to an interal page in the same wiki
73    ExternalToInternal(link_data::ExternalToInteralData),
74}
75
76impl Link {
77    pub fn title(&self) -> Option<&str> {
78        match self {
79            Link::Anchor(link_data) => Some(&link_data.title),
80            Link::RedLink(link_data) => Some(&link_data.title),
81            &Link::External(_) => None,
82            &Link::ExternalToInternal(_) => None,
83            Link::MediaLink(link_data) => Some(&link_data.title),
84            Link::Internal(link_data) => Some(&link_data.title),
85        }
86    }
87}
88
89// TODO: replace this with Link::Internal
90#[derive(Debug, Clone, PartialEq, Eq)]
91pub struct LanguageLink {
92    pub name: String,
93    pub language: Language,
94    pub autonym: String,
95    pub title: String,
96    pub url: Url,
97    pub endpoint: Endpoint,
98}
99
100#[derive(Deserialize)]
101struct LanguageLinkInt {
102    #[serde(rename = "langname")]
103    name: String,
104    #[serde(rename = "lang")]
105    #[serde(deserialize_with = "language_from_str")]
106    language: Language,
107    autonym: String,
108    title: String,
109    url: Url,
110}
111
112fn language_from_str<'de, T, D>(deserializer: D) -> Result<T, D::Error>
113where
114    T: FromStr,
115    T::Err: Display,
116    D: Deserializer<'de>,
117{
118    String::deserialize(deserializer)?
119        .parse()
120        .map_err(de::Error::custom)
121}
122
123#[derive(Debug, Deserialize, Clone, PartialEq, Eq)]
124pub struct Section {
125    #[serde(skip_deserializing)]
126    pub index: usize,
127    #[serde(rename = "toclevel")]
128    pub header_kind: HeaderKind,
129    #[serde(rename = "line")]
130    pub text: String,
131    pub number: String,
132    pub anchor: String,
133}
134
135#[derive(Clone, PartialEq, Eq)]
136pub struct Page {
137    pub title: String,
138    pub pageid: usize,
139    pub content: Document,
140    pub language: Language,
141    pub language_links: Option<Vec<LanguageLink>>,
142    pub sections: Option<Vec<Section>>,
143    pub revision_id: Option<usize>,
144}
145
146impl Page {
147    #[cfg(debug_assertions)]
148    pub fn from_path(path: &std::path::PathBuf) -> Option<Page> {
149        if !path.exists() {
150            return None;
151        }
152
153        let content = std::fs::read_to_string(path).ok()?;
154        let nodes = WikipediaParser::parse_document(
155            &content,
156            url::Url::parse("https://en.wikipedia.org/w/api.php").ok()?,
157            Language::default(),
158        )
159        .nodes();
160
161        Some(Page {
162            title: "DEBUG: FILE".to_string(),
163            pageid: 0,
164            content: Document { nodes },
165            language: Language::default(),
166            language_links: None,
167            sections: None,
168            revision_id: None,
169        })
170    }
171
172    pub fn builder() -> PageBuilder<NoPageID, NoPage, NoEndpoint, NoLanguage> {
173        PageBuilder::default()
174    }
175
176    pub fn available_languages(&self) -> Option<usize> {
177        if let Some(ref links) = self.language_links {
178            return Some(links.len());
179        }
180        None
181    }
182
183    pub fn sections(&self) -> Option<&Vec<Section>> {
184        if let Some(ref sections) = self.sections {
185            return Some(sections);
186        }
187        None
188    }
189}
190
191impl std::fmt::Debug for Page {
192    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
193        f.debug_struct("Page")
194            .field("title", &self.title)
195            .field("pageid", &self.pageid)
196            .field("content", &self.content)
197            .field("language", &self.language)
198            .field("language_links", &self.language_links.is_some())
199            .field("sections", &self.sections.is_some())
200            .field("revision_id", &self.revision_id)
201            .finish()
202    }
203}
204
/// The pieces of information that can be requested about an article
#[derive(Clone)]
pub enum Property {
    /// The parsed text of the wikitext
    Text,
    /// The language links in the parsed wikitext
    LangLinks,
    /// The categories in the parsed wikitext
    Categories,
    /// The HTML version of the categories
    CategoriesHTML,
    /// The templates in the parsed wikitext
    Templates,
    /// The images in the parsed wikitext
    Images,
    /// The external links in the parsed wikitext
    ExternalLinks,
    /// The sections in the parsed wikitext
    Sections,
    /// The revision ID of the parsed page
    RevID,
    /// The title of the parsed wikitext
    DisplayTitle,
    /// The page subtitle for the parsed page
    Subtitle,
    /// The parsed doctype, opening `<html>`, `<head>` and opening `<body>` of the page
    HeadHTML,
    /// The HTML of page status indicators used on the page
    Indicators,
    /// The interwiki links in the parsed wikitext
    InterwikiLinks,
    /// The original wikitext that was parsed
    Wikitext,
    /// Various properties defined in the parsed wikitext
    Properties,
    /// The limit report in a structured way
    LimitReportData,
    /// The HTML version of the limit report
    LimitReportHTML,
    /// The XML parse tree of revision content (requires content model `wikitext`)
    ParseTree,
    /// The warnings that occurred while parsing content (as wikitext)
    ParseWarnings,
    /// The warnings that occurred while parsing content (as HTML)
    ParseWarningsHTML,
}
251
252impl Display for Property {
253    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
254        match self {
255            Property::Text => write!(f, "text"),
256            Property::LangLinks => write!(f, "langlinks"),
257            Property::Categories => write!(f, "categories"),
258            Property::CategoriesHTML => write!(f, "categorieshtml"),
259            Property::Templates => write!(f, "templates"),
260            Property::Images => write!(f, "images"),
261            Property::ExternalLinks => write!(f, "externallinks"),
262            Property::Sections => write!(f, "sections"),
263            Property::RevID => write!(f, "revid"),
264            Property::DisplayTitle => write!(f, "displaytitle"),
265            Property::Subtitle => write!(f, "subtitle"),
266            Property::HeadHTML => write!(f, "headhtml"),
267            Property::Indicators => write!(f, "indicators"),
268            Property::InterwikiLinks => write!(f, "iwlinks"),
269            Property::Wikitext => write!(f, "wikitext"),
270            Property::Properties => write!(f, "properties"),
271            Property::LimitReportData => write!(f, "limitreportdata"),
272            Property::LimitReportHTML => write!(f, "limitreporthtml"),
273            Property::ParseTree => write!(f, "parsetree"),
274            Property::ParseWarnings => write!(f, "parsewarnings"),
275            Property::ParseWarningsHTML => write!(f, "parsewarningshtml"),
276        }
277    }
278}
279
280pub struct WithPageID(usize);
281#[derive(Default)]
282pub struct NoPageID;
283
284pub struct WithPage(String);
285#[derive(Default)]
286pub struct NoPage;
287
288pub struct WithEndpoint(Url);
289#[derive(Default)]
290pub struct NoEndpoint;
291
292pub struct WithLanguage(Language);
293#[derive(Default)]
294pub struct NoLanguage;
295
296#[derive(Default)]
297pub struct PageBuilder<I, P, E, L> {
298    pageid: I,
299    page: P,
300    endpoint: E,
301    language: L,
302    revision: Option<usize>,
303    redirects: Option<bool>,
304    properties: Option<Vec<Property>>,
305}
306
307pub type PageRequest = PageBuilder<NoPageID, WithPage, WithEndpoint, WithLanguage>;
308pub type PageRequestID = PageBuilder<WithPageID, NoPage, WithEndpoint, WithLanguage>;
309
310impl<E, L> PageBuilder<NoPageID, NoPage, E, L> {
311    /// Parse content of this page
312    pub fn pageid(self, pageid: usize) -> PageBuilder<WithPageID, NoPage, E, L> {
313        PageBuilder {
314            pageid: WithPageID(pageid),
315            page: self.page,
316            endpoint: self.endpoint,
317            revision: self.revision,
318            redirects: self.redirects,
319            properties: self.properties,
320            language: self.language,
321        }
322    }
323
324    /// Parse content of this page
325    pub fn page(self, page: impl Into<String>) -> PageBuilder<NoPageID, WithPage, E, L> {
326        PageBuilder {
327            pageid: self.pageid,
328            page: WithPage(page.into()),
329            endpoint: self.endpoint,
330            revision: self.revision,
331            redirects: self.redirects,
332            properties: self.properties,
333            language: self.language,
334        }
335    }
336}
337
338impl<I, P, L> PageBuilder<I, P, NoEndpoint, L> {
339    pub fn url(self, url: impl Into<Url>) -> PageBuilder<I, P, WithEndpoint, L> {
340        PageBuilder {
341            pageid: self.pageid,
342            page: self.page,
343            endpoint: WithEndpoint(url.into()),
344            revision: self.revision,
345            redirects: self.redirects,
346            properties: self.properties,
347            language: self.language,
348        }
349    }
350
351    pub fn endpoint(self, endpoint: Url) -> PageBuilder<I, P, WithEndpoint, L> {
352        PageBuilder {
353            pageid: self.pageid,
354            page: self.page,
355            endpoint: WithEndpoint(endpoint),
356            revision: self.revision,
357            redirects: self.redirects,
358            properties: self.properties,
359            language: self.language,
360        }
361    }
362}
363
364impl<I, P, E> PageBuilder<I, P, E, NoLanguage> {
365    pub fn language(self, language: Language) -> PageBuilder<I, P, E, WithLanguage> {
366        PageBuilder {
367            pageid: self.pageid,
368            page: self.page,
369            endpoint: self.endpoint,
370            language: WithLanguage(language),
371            revision: self.revision,
372            redirects: self.redirects,
373            properties: self.properties,
374        }
375    }
376}
377
378impl<I, P, U, L> PageBuilder<I, P, U, L> {
379    /// Revision ID, for `{{REVISIONID}}` and similar variables
380    pub fn revision(mut self, revision: usize) -> Self {
381        self.revision = Some(revision);
382        self
383    }
384
385    /// If page or pageid is set to a redirect, resolve it
386    pub fn redirects(mut self, redirects: bool) -> Self {
387        self.redirects = Some(redirects);
388        self
389    }
390
391    /// Which pieces of information to get
392    pub fn properties(mut self, properties: Vec<Property>) -> Self {
393        self.properties = Some(properties);
394        self
395    }
396}
397
398impl<I, P> PageBuilder<I, P, WithEndpoint, WithLanguage> {
399    async fn fetch_with_params(self, mut params: Vec<(&str, String)>) -> Result<Page> {
400        async fn action_parse(params: Vec<(&str, String)>, endpoint: Url) -> Result<Response> {
401            Client::new()
402                .get(endpoint)
403                .query(&[
404                    ("action", "parse"),
405                    ("format", "json"),
406                    ("formatversion", "2"),
407                    ("parsoid", "true"),
408                ])
409                .query(&params)
410                .send()
411                .await
412                .map(|response| {
413                    debug!("response url: '{}'", response.url().as_str());
414                    response
415                })
416                .context("failed sending the request")
417        }
418
419        if let Some(revision) = self.revision {
420            params.push(("revid", revision.to_string()));
421        }
422
423        if let Some(redirects) = self.redirects {
424            params.push(("redirects", redirects.to_string()));
425        }
426
427        if let Some(ref prop) = self.properties {
428            let mut prop_str = String::new();
429            for prop in prop {
430                prop_str.push('|');
431                prop_str.push_str(&prop.to_string())
432            }
433            params.push(("prop", prop_str));
434        }
435
436        let response = action_parse(params, self.endpoint.0.clone())
437            .await?
438            .error_for_status()
439            .context("the server returned an error")?;
440
441        let res_json: serde_json::Value = serde_json::from_str(
442            &response
443                .text()
444                .await
445                .context("failed reading the response")?,
446        )
447        .context("failed interpreting the response as json")?;
448
449        self.serialize_result(res_json)
450            .context("failed serializing the returned response")
451    }
452
453    fn serialize_result(self, res_json: serde_json::Value) -> Result<Page> {
454        let title = res_json
455            .get("parse")
456            .and_then(|x| x.get("title"))
457            .and_then(|x| x.as_str())
458            .map(|x| x.to_string())
459            .ok_or_else(|| anyhow!("missing the title"))?;
460
461        let pageid = res_json
462            .get("parse")
463            .and_then(|x| x.get("pageid"))
464            .and_then(|x| x.as_u64())
465            .map(|x| x as usize)
466            .ok_or_else(|| anyhow!("missing the pageid"))?;
467
468        let endpoint = self.endpoint.0;
469        let language = self.language.0;
470        let content = res_json
471            .get("parse")
472            .and_then(|x| x.get("text"))
473            .and_then(|x| x.as_str())
474            .map(|x| {
475                let parser = WikipediaParser::parse_document(x, endpoint.clone(), language);
476                Document {
477                    nodes: parser.nodes(),
478                }
479            })
480            // HACK: implement correct errors
481            .ok_or(anyhow!("missing the content or failed parsing the content"))?;
482
483        let language_links = res_json
484            .get("parse")
485            .and_then(|x| x.get("langlinks"))
486            .and_then(|x| x.as_array())
487            .map(|x| x.to_owned())
488            .map(|x| {
489                x.into_iter()
490                    .filter_map(|x| {
491                        let language_int: LanguageLinkInt = serde_json::from_value(x)
492                            .map_err(|err| warn!("language_link parsing error: {:?}", err))
493                            .ok()?;
494                        let mut endpoint = endpoint.clone();
495                        let _ = endpoint.set_host(Some(language_int.url.host_str().unwrap()));
496                        Some(LanguageLink {
497                            name: language_int.name,
498                            language: language_int.language,
499                            autonym: language_int.autonym,
500                            title: language_int.title,
501                            url: language_int.url,
502                            endpoint,
503                        })
504                    })
505                    .collect::<Vec<LanguageLink>>()
506            })
507            .map(|x| {
508                debug!("language_links: '{}'", x.len());
509                x
510            });
511
512        let sections = res_json
513            .get("parse")
514            .and_then(|x| x.get("sections"))
515            .and_then(|x| x.as_array())
516            .map(|x| x.to_owned())
517            .map(|x| {
518                x.into_iter()
519                    .enumerate()
520                    .filter_map(|(i, x)| {
521                        serde_json::from_value(x).ok().map(|mut x: Section| {
522                            x.index = i + 1;
523                            // TODO: render html tags in the toc
524                            let fragment = Html::parse_document(&x.text);
525                            x.text = fragment.root_element().text().collect();
526                            x
527                        })
528                    })
529                    .collect::<Vec<Section>>()
530            })
531            .map(|mut x| {
532                x.insert(
533                    0,
534                    Section {
535                        index: 0,
536                        header_kind: HeaderKind::Main,
537                        text: "(Top)".to_string(),
538                        number: "".to_string(),
539                        anchor: "Content_Top".to_string(),
540                    },
541                );
542                x
543            });
544
545        let revision_id = res_json
546            .get("parse")
547            .and_then(|x| x.get("revid"))
548            .and_then(|x| x.as_u64())
549            .map(|x| x as usize);
550
551        Ok(Page {
552            title,
553            pageid,
554            content,
555            language,
556            language_links,
557            sections,
558            revision_id,
559        })
560    }
561}
562
563impl PageBuilder<WithPageID, NoPage, WithEndpoint, WithLanguage> {
564    pub async fn fetch(self) -> Result<Page> {
565        let param = vec![("pageid", self.pageid.0.to_string())];
566        self.fetch_with_params(param).await
567    }
568}
569
570impl PageBuilder<NoPageID, WithPage, WithEndpoint, WithLanguage> {
571    pub async fn fetch(self) -> Result<Page> {
572        let param = vec![("page", self.page.0.to_string())];
573        self.fetch_with_params(param).await
574    }
575}