wiki_api/
page.rs

1use crate::{
2    document::{Document, HeaderKind},
3    parser::{Parser, WikipediaParser},
4    Endpoint,
5};
6use anyhow::{anyhow, Context, Result};
7use reqwest::{Client, Response};
8use scraper::Html;
9use serde::{Deserialize, Serialize};
10use std::fmt::Display;
11use tracing::{debug, warn};
12use url::Url;
13use uuid::Uuid;
14
15use super::languages::Language;
16
17pub mod link_data {
18    use crate::{languages::Language, search::Namespace, Endpoint};
19    use serde::{Deserialize, Serialize};
20    use url::Url;
21
22    #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
23    pub struct InternalData {
24        pub namespace: Namespace,
25        pub page: String,
26        pub title: String,
27        pub endpoint: Endpoint,
28        pub language: Language,
29        pub anchor: Option<AnchorData>,
30    }
31
32    #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
33    pub struct AnchorData {
34        pub anchor: String,
35        pub title: String,
36    }
37
38    #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
39    pub struct RedLinkData {
40        pub url: Url,
41        pub title: String,
42    }
43
44    #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
45    pub struct MediaData {
46        pub url: Url,
47        pub title: String,
48    }
49
50    #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
51    pub struct ExternalData {
52        pub url: Url,
53    }
54
55    #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
56    pub struct ExternalToInteralData {}
57}
58
59#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
60pub enum Link {
61    /// Interal link to another page in the same wiki
62    Internal(link_data::InternalData),
63    /// Anchor to a specific section in the current page
64    /// Note: this only corresponds to anchors on the current page. For anchors in another page on
65    /// the same wiki, `LinkType::Internal` is used
66    Anchor(link_data::AnchorData),
67    /// A special type of link that leads to an internal page that doesn't exist yet
68    RedLink(link_data::RedLinkData),
69    /// Link pointing to a media
70    MediaLink(link_data::MediaData),
71    /// External link to a page at another website
72    External(link_data::ExternalData),
73    /// External link to an interal page in the same wiki
74    ExternalToInternal(link_data::ExternalToInteralData),
75}
76
77impl Link {
78    pub fn title(&self) -> Option<&str> {
79        match self {
80            Link::Anchor(link_data) => Some(&link_data.title),
81            Link::RedLink(link_data) => Some(&link_data.title),
82            &Link::External(_) => None,
83            &Link::ExternalToInternal(_) => None,
84            Link::MediaLink(link_data) => Some(&link_data.title),
85            Link::Internal(link_data) => Some(&link_data.title),
86        }
87    }
88}
89
90// TODO: replace this with Link::Internal
91#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
92pub struct LanguageLink {
93    #[serde(rename = "langname")]
94    pub name: String,
95    #[serde(rename = "lang")]
96    pub language: Language,
97    pub autonym: String,
98    pub title: String,
99    pub url: Url,
100    pub endpoint: Endpoint,
101}
102
103#[derive(Debug, Deserialize, Clone, PartialEq, Eq, Serialize)]
104pub struct Section {
105    #[serde(skip_deserializing)]
106    pub index: usize,
107    #[serde(rename = "toclevel")]
108    pub header_kind: HeaderKind,
109    #[serde(rename = "line")]
110    pub text: String,
111    pub number: String,
112    pub anchor: String,
113}
114
115#[derive(Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
116pub struct Page {
117    pub title: String,
118    pub pageid: usize,
119    pub content: Document,
120    pub language: Language,
121    pub language_links: Option<Vec<LanguageLink>>,
122    pub sections: Option<Vec<Section>>,
123    pub revision_id: Option<usize>,
124    pub uuid: Uuid,
125}
126
127impl Page {
128    #[cfg(debug_assertions)]
129    pub fn from_path(path: &std::path::PathBuf) -> Option<Page> {
130        if !path.exists() {
131            return None;
132        }
133
134        let content = std::fs::read_to_string(path).ok()?;
135        let nodes = WikipediaParser::parse_document(
136            &content,
137            url::Url::parse("https://en.wikipedia.org/w/api.php").ok()?,
138            Language::default(),
139        )
140        .nodes();
141
142        Some(Page {
143            title: "DEBUG: FILE".to_string(),
144            pageid: 0,
145            content: Document { nodes },
146            language: Language::default(),
147            language_links: None,
148            sections: None,
149            revision_id: None,
150            uuid: Uuid::new_v4(),
151        })
152    }
153
154    pub fn builder() -> PageBuilder<NoPageID, NoPage, NoEndpoint, NoLanguage> {
155        PageBuilder::default()
156    }
157
158    pub fn available_languages(&self) -> Option<usize> {
159        if let Some(ref links) = self.language_links {
160            return Some(links.len());
161        }
162        None
163    }
164
165    pub fn sections(&self) -> Option<&Vec<Section>> {
166        if let Some(ref sections) = self.sections {
167            return Some(sections);
168        }
169        None
170    }
171}
172
173impl std::fmt::Debug for Page {
174    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
175        f.debug_struct("Page")
176            .field("title", &self.title)
177            .field("pageid", &self.pageid)
178            .field("content", &self.content)
179            .field("language", &self.language)
180            .field("language_links", &self.language_links.is_some())
181            .field("sections", &self.sections.is_some())
182            .field("revision_id", &self.revision_id)
183            .finish()
184    }
185}
186
/// Which pieces of information to get about the article
///
/// Each variant corresponds to one value of the `prop` request parameter; the `Display`
/// implementation of this type yields the string that is sent.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Property {
    /// Gives the parsed text of the wikitext
    Text,
    /// Gives the language links in the parsed wikitext
    LangLinks,
    /// Gives the categories in the parsed wikitext
    Categories,
    /// Gives the HTML version of the categories
    CategoriesHTML,
    /// Gives the templates in the parsed wikitext
    Templates,
    /// Gives the images in the parsed wikitext
    Images,
    /// Gives the external links in the parsed wikitext
    ExternalLinks,
    /// Gives the sections in the parsed wikitext
    Sections,
    /// Adds the revision ID of the parsed page
    RevID,
    /// Adds the title of the parsed wikitext
    DisplayTitle,
    /// Adds the page subtitle for the parsed page
    Subtitle,
    /// Gives parsed doctype, opening `<html>`, `<head>` and opening `<body>` of the page
    HeadHTML,
    /// Gives the HTML of page status indicators used on the page
    Indicators,
    /// Gives interwiki links in the parsed wikitext
    InterwikiLinks,
    /// Gives the original wikitext that was parsed
    Wikitext,
    /// Gives various properties defined in the parsed wikitext
    Properties,
    /// Gives the limit report in a structured way
    LimitReportData,
    /// Gives the HTML version of the limit report
    LimitReportHTML,
    /// The XML parse tree of revision content (requires content model `wikitext`)
    ParseTree,
    /// Gives the warnings that occurred while parsing content (as wikitext)
    ParseWarnings,
    /// Gives the warnings that occurred while parsing content (as HTML)
    ParseWarningsHTML,
}
233
234impl Display for Property {
235    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
236        match self {
237            Property::Text => write!(f, "text"),
238            Property::LangLinks => write!(f, "langlinks"),
239            Property::Categories => write!(f, "categories"),
240            Property::CategoriesHTML => write!(f, "categorieshtml"),
241            Property::Templates => write!(f, "templates"),
242            Property::Images => write!(f, "images"),
243            Property::ExternalLinks => write!(f, "externallinks"),
244            Property::Sections => write!(f, "sections"),
245            Property::RevID => write!(f, "revid"),
246            Property::DisplayTitle => write!(f, "displaytitle"),
247            Property::Subtitle => write!(f, "subtitle"),
248            Property::HeadHTML => write!(f, "headhtml"),
249            Property::Indicators => write!(f, "indicators"),
250            Property::InterwikiLinks => write!(f, "iwlinks"),
251            Property::Wikitext => write!(f, "wikitext"),
252            Property::Properties => write!(f, "properties"),
253            Property::LimitReportData => write!(f, "limitreportdata"),
254            Property::LimitReportHTML => write!(f, "limitreporthtml"),
255            Property::ParseTree => write!(f, "parsetree"),
256            Property::ParseWarnings => write!(f, "parsewarnings"),
257            Property::ParseWarningsHTML => write!(f, "parsewarningshtml"),
258        }
259    }
260}
261
262pub struct WithPageID(usize);
263#[derive(Default)]
264pub struct NoPageID;
265
266pub struct WithPage(String);
267#[derive(Default)]
268pub struct NoPage;
269
270pub struct WithEndpoint(Url);
271#[derive(Default)]
272pub struct NoEndpoint;
273
274pub struct WithLanguage(Language);
275#[derive(Default)]
276pub struct NoLanguage;
277
278#[derive(Default)]
279pub struct PageBuilder<I, P, E, L> {
280    pageid: I,
281    page: P,
282    endpoint: E,
283    language: L,
284    revision: Option<usize>,
285    redirects: Option<bool>,
286    properties: Option<Vec<Property>>,
287}
288
289pub type PageRequest = PageBuilder<NoPageID, WithPage, WithEndpoint, WithLanguage>;
290pub type PageRequestID = PageBuilder<WithPageID, NoPage, WithEndpoint, WithLanguage>;
291
292impl<E, L> PageBuilder<NoPageID, NoPage, E, L> {
293    /// Parse content of this page
294    pub fn pageid(self, pageid: usize) -> PageBuilder<WithPageID, NoPage, E, L> {
295        PageBuilder {
296            pageid: WithPageID(pageid),
297            page: self.page,
298            endpoint: self.endpoint,
299            revision: self.revision,
300            redirects: self.redirects,
301            properties: self.properties,
302            language: self.language,
303        }
304    }
305
306    /// Parse content of this page
307    pub fn page(self, page: impl Into<String>) -> PageBuilder<NoPageID, WithPage, E, L> {
308        PageBuilder {
309            pageid: self.pageid,
310            page: WithPage(page.into()),
311            endpoint: self.endpoint,
312            revision: self.revision,
313            redirects: self.redirects,
314            properties: self.properties,
315            language: self.language,
316        }
317    }
318}
319
320impl<I, P, L> PageBuilder<I, P, NoEndpoint, L> {
321    pub fn url(self, url: impl Into<Url>) -> PageBuilder<I, P, WithEndpoint, L> {
322        PageBuilder {
323            pageid: self.pageid,
324            page: self.page,
325            endpoint: WithEndpoint(url.into()),
326            revision: self.revision,
327            redirects: self.redirects,
328            properties: self.properties,
329            language: self.language,
330        }
331    }
332
333    pub fn endpoint(self, endpoint: Url) -> PageBuilder<I, P, WithEndpoint, L> {
334        PageBuilder {
335            pageid: self.pageid,
336            page: self.page,
337            endpoint: WithEndpoint(endpoint),
338            revision: self.revision,
339            redirects: self.redirects,
340            properties: self.properties,
341            language: self.language,
342        }
343    }
344}
345
346impl<I, P, E> PageBuilder<I, P, E, NoLanguage> {
347    pub fn language(self, language: Language) -> PageBuilder<I, P, E, WithLanguage> {
348        PageBuilder {
349            pageid: self.pageid,
350            page: self.page,
351            endpoint: self.endpoint,
352            language: WithLanguage(language),
353            revision: self.revision,
354            redirects: self.redirects,
355            properties: self.properties,
356        }
357    }
358}
359
360impl<I, P, U, L> PageBuilder<I, P, U, L> {
361    /// Revision ID, for `{{REVISIONID}}` and similar variables
362    pub fn revision(mut self, revision: usize) -> Self {
363        self.revision = Some(revision);
364        self
365    }
366
367    /// If page or pageid is set to a redirect, resolve it
368    pub fn redirects(mut self, redirects: bool) -> Self {
369        self.redirects = Some(redirects);
370        self
371    }
372
373    /// Which pieces of information to get
374    pub fn properties(mut self, properties: Vec<Property>) -> Self {
375        self.properties = Some(properties);
376        self
377    }
378}
379
380impl<I, P> PageBuilder<I, P, WithEndpoint, WithLanguage> {
381    async fn fetch_with_params(self, mut params: Vec<(&str, String)>) -> Result<Page> {
382        async fn action_parse(params: Vec<(&str, String)>, endpoint: Url) -> Result<Response> {
383            Client::new()
384                .get(endpoint)
385                .header(
386                    "User-Agent",
387                    format!(
388                        "wiki-tui/{} (https://github.com/Builditluc/wiki-tui)",
389                        env!("CARGO_PKG_VERSION")
390                    ),
391                )
392                .query(&[
393                    ("action", "parse"),
394                    ("format", "json"),
395                    ("formatversion", "2"),
396                    ("parsoid", "true"),
397                ])
398                .query(&params)
399                .send()
400                .await
401                .inspect(|response| {
402                    debug!("response url: '{}'", response.url().as_str());
403                })
404                .context("failed sending the request")
405        }
406
407        if let Some(revision) = self.revision {
408            params.push(("revid", revision.to_string()));
409        }
410
411        if let Some(redirects) = self.redirects {
412            params.push(("redirects", redirects.to_string()));
413        }
414
415        if let Some(ref prop) = self.properties {
416            let mut prop_str = String::new();
417            for prop in prop {
418                prop_str.push('|');
419                prop_str.push_str(&prop.to_string())
420            }
421            params.push(("prop", prop_str));
422        }
423
424        let response = action_parse(params, self.endpoint.0.clone())
425            .await?
426            .error_for_status()
427            .context("the server returned an error")?;
428
429        let res_json: serde_json::Value = serde_json::from_str(
430            &response
431                .text()
432                .await
433                .context("failed reading the response")?,
434        )
435        .context("failed interpreting the response as json")?;
436
437        self.serialize_result(res_json)
438            .context("failed serializing the returned response")
439    }
440
441    fn serialize_result(self, res_json: serde_json::Value) -> Result<Page> {
442        let title = res_json
443            .get("parse")
444            .and_then(|x| x.get("title"))
445            .and_then(|x| x.as_str())
446            .map(|x| x.to_string())
447            .ok_or_else(|| anyhow!("missing the title"))?;
448
449        let pageid = res_json
450            .get("parse")
451            .and_then(|x| x.get("pageid"))
452            .and_then(|x| x.as_u64())
453            .map(|x| x as usize)
454            .ok_or_else(|| anyhow!("missing the pageid"))?;
455
456        let endpoint = self.endpoint.0;
457        let language = self.language.0;
458        let content = res_json
459            .get("parse")
460            .and_then(|x| x.get("text"))
461            .and_then(|x| x.as_str())
462            .map(|x| {
463                let parser = WikipediaParser::parse_document(x, endpoint.clone(), language);
464                Document {
465                    nodes: parser.nodes(),
466                }
467            })
468            // HACK: implement correct errors
469            .ok_or(anyhow!("missing the content or failed parsing the content"))?;
470
471        let language_links = res_json
472            .get("parse")
473            .and_then(|x| x.get("langlinks"))
474            .and_then(|x| x.as_array())
475            .map(|x| x.to_owned())
476            .map(|x| {
477                x.into_iter()
478                    .filter_map(|x| {
479                        let mut language_link: LanguageLink = serde_json::from_value(x)
480                            .map_err(|err| warn!("language_link parsing error: {:?}", err))
481                            .ok()?;
482                        let mut endpoint = endpoint.clone();
483                        let _ = endpoint.set_host(Some(language_link.url.host_str().unwrap()));
484                        language_link.endpoint = endpoint;
485                        Some(language_link)
486                    })
487                    .collect::<Vec<LanguageLink>>()
488            })
489            .inspect(|x| {
490                debug!("language_links: '{}'", x.len());
491            });
492
493        let sections = res_json
494            .get("parse")
495            .and_then(|x| x.get("sections"))
496            .and_then(|x| x.as_array())
497            .map(|x| x.to_owned())
498            .map(|x| {
499                x.into_iter()
500                    .enumerate()
501                    .filter_map(|(i, x)| {
502                        serde_json::from_value(x).ok().map(|mut x: Section| {
503                            x.index = i + 1;
504                            // TODO: render html tags in the toc
505                            let fragment = Html::parse_document(&x.text);
506                            x.text = fragment.root_element().text().collect();
507                            x
508                        })
509                    })
510                    .collect::<Vec<Section>>()
511            })
512            .map(|mut x| {
513                x.insert(
514                    0,
515                    Section {
516                        index: 0,
517                        header_kind: HeaderKind::Main,
518                        text: "(Top)".to_string(),
519                        number: "".to_string(),
520                        anchor: "Content_Top".to_string(),
521                    },
522                );
523                x
524            });
525
526        let revision_id = res_json
527            .get("parse")
528            .and_then(|x| x.get("revid"))
529            .and_then(|x| x.as_u64())
530            .map(|x| x as usize);
531
532        Ok(Page {
533            title,
534            pageid,
535            content,
536            language,
537            language_links,
538            sections,
539            revision_id,
540            uuid: Uuid::new_v4(),
541        })
542    }
543}
544
545impl PageBuilder<WithPageID, NoPage, WithEndpoint, WithLanguage> {
546    pub async fn fetch(self) -> Result<Page> {
547        let param = vec![("pageid", self.pageid.0.to_string())];
548        self.fetch_with_params(param).await
549    }
550}
551
552impl PageBuilder<NoPageID, WithPage, WithEndpoint, WithLanguage> {
553    pub async fn fetch(self) -> Result<Page> {
554        let param = vec![("page", self.page.0.to_string())];
555        self.fetch_with_params(param).await
556    }
557}