wiki_api/
parser.rs

1use html5ever::{parse_document, tendril::TendrilSink};
2use markup5ever_rcdom::{Handle, NodeData, RcDom};
3use std::str::FromStr;
4use tracing::{trace, warn};
5use url::Url;
6
7use crate::{
8    document::{Data, HeaderKind, Raw, UnsupportedElement},
9    languages::Language,
10    page::{
11        link_data::{AnchorData, ExternalData, ExternalToInteralData, InternalData, MediaData},
12        Link,
13    },
14    search::Namespace,
15    Endpoint,
16};
17
18// TODO: remove Parser and replace it with normal functions and helper functions
19pub trait Parser {
20    fn parse_document(document: &str, endpoint: Endpoint, language: Language) -> Self;
21    fn nodes(self) -> Vec<Raw>;
22}
23
24pub struct WikipediaParser {
25    nodes: Vec<Raw>,
26    endpoint: Endpoint,
27    language: Language,
28}
29
30impl WikipediaParser {
31    fn parse_node(
32        &mut self,
33        node: &Handle,
34        parent: Option<usize>,
35        prev: Option<usize>,
36    ) -> Option<usize> {
37        match node.data {
38            NodeData::Document => {
39                let mut prev = None;
40                for child in node.children.borrow().iter() {
41                    prev = self.parse_node(child, parent, prev)
42                }
43                None
44            }
45            NodeData::Text { ref contents } => {
46                let data = Data::Text {
47                    contents: contents.borrow().to_string(),
48                };
49                Some(self.push_node(data, parent, prev))
50            }
51            NodeData::Element {
52                ref name,
53                ref attrs,
54                ..
55            } => {
56                let name = name.local.to_string();
57                let attrs: Vec<(String, String)> = attrs
58                    .borrow()
59                    .iter()
60                    .map(|attr| (attr.name.local.to_string(), attr.value.to_string()))
61                    .collect();
62
63                let mut ignore_children = false;
64
65                let data = match name.as_str() {
66                    "head" | "style" | "link" => return prev,
67
68                    "table" => {
69                        ignore_children = true;
70                        Data::Unsupported(UnsupportedElement::Table)
71                    }
72                    "image" => {
73                        ignore_children = true;
74                        Data::Unsupported(UnsupportedElement::Image)
75                    }
76                    "figure" => {
77                        ignore_children = true;
78                        Data::Unsupported(UnsupportedElement::Figure)
79                    }
80                    "pre" => {
81                        ignore_children = true;
82                        Data::Unsupported(UnsupportedElement::PreformattedText)
83                    }
84
85                    "span"
86                        if attrs.iter().any(|(name, value)| {
87                            name.as_str() == "class"
88                                && (value.contains("texhtml") || value.contains("mwe-math-element"))
89                        }) =>
90                    {
91                        ignore_children = true;
92                        Data::UnsupportedInline(UnsupportedElement::MathElement)
93                    }
94
95                    "ul" if attrs.iter().any(|(name, value)| {
96                        name.as_str() == "class" && value.contains("portalbox")
97                    }) =>
98                    {
99                        trace!("ignoring 'ul' class: 'portalbox'");
100                        return prev;
101                    }
102
103                    "div"
104                        if attrs.iter().any(|(name, value)| {
105                            name.as_str() == "class"
106                                && (value.contains("toc") || value.contains("quotebox"))
107                        }) =>
108                    {
109                        trace!("ignoring 'div': class: 'toc' || 'quotebox'");
110                        return prev;
111                    }
112
113                    "div"
114                        if attrs.iter().any(|(name, value)| {
115                            name.as_str() == "class" && value.contains("mw-empty-elt")
116                        }) =>
117                    {
118                        trace!("ignoring 'div': class: 'mw-empty-elt'");
119                        return prev;
120                    }
121
122                    "span"
123                        if attrs.iter().any(|(name, value)| {
124                            name.as_str() == "class" && value.contains("cs1-maint")
125                        }) =>
126                    {
127                        trace!("ignoring 'span': class: 'cs1-maint'");
128                        return prev;
129                    }
130
131                    _ if attrs.iter().any(|(name, value)| {
132                        name.as_str() == "class" && value.contains("noprint")
133                    }) =>
134                    {
135                        trace!("ignoring '{name}': class: 'noprint'");
136                        return prev;
137                    }
138
139                    "span"
140                        if attrs.iter().any(|(name, value)| {
141                            name.as_str() == "class" && value.contains("mw-editsection")
142                        }) =>
143                    {
144                        trace!("ignoring 'span': class: 'mw-editsection'");
145                        return prev;
146                    }
147
148                    "span"
149                        if attrs.iter().any(|(name, value)| {
150                            name.as_str() == "typeof" && value.contains("mw:Nowiki")
151                        }) =>
152                    {
153                        trace!("ignoring 'span': class: 'mw:Nowiki'");
154                        return prev;
155                    }
156
157                    "span"
158                        if attrs.iter().any(|(name, value)| {
159                            name.as_str() == "class" && value.contains("mw-reflink-text")
160                        }) =>
161                    {
162                        Data::Reflink
163                    }
164
165                    "section" => self.parse_section(attrs.iter()).unwrap_or_default(),
166                    "h1" => self
167                        .parse_header(attrs.iter(), HeaderKind::Main)
168                        .unwrap_or_default(),
169
170                    "h2" => self
171                        .parse_header(attrs.iter(), HeaderKind::Sub)
172                        .unwrap_or_default(),
173                    "h3" => self
174                        .parse_header(attrs.iter(), HeaderKind::Section)
175                        .unwrap_or_default(),
176                    "h4" => self
177                        .parse_header(attrs.iter(), HeaderKind::Subsection)
178                        .unwrap_or_default(),
179                    "h5" => self
180                        .parse_header(attrs.iter(), HeaderKind::Minor)
181                        .unwrap_or_default(),
182                    "h6" => self
183                        .parse_header(attrs.iter(), HeaderKind::Detail)
184                        .unwrap_or_default(),
185
186                    "blockquote" => Data::Blockquote,
187
188                    "ol" => Data::OrderedList,
189                    "ul" => Data::UnorderedList,
190                    "li" => Data::ListItem,
191
192                    "dl" => Data::DescriptionList,
193                    "dt" => Data::DescriptionListTerm,
194                    "dd" => Data::DerscriptionListDescription,
195
196                    "br" => Data::Linebreak,
197
198                    "b" => Data::Bold,
199                    "i" => Data::Italic,
200
201                    "p" => Data::Paragraph,
202                    "span" => Data::Span,
203
204                    "div"
205                        if attrs.iter().any(|(name, value)| {
206                            name.as_str() == "class" && value.contains("redirectMsg")
207                        }) =>
208                    {
209                        Data::RedirectMessage
210                    }
211
212                    "div"
213                        if attrs.iter().any(|(name, value)| {
214                            name.as_str() == "class" && value.contains("hatnote")
215                        }) =>
216                    {
217                        Data::Disambiguation
218                    }
219
220                    "a" => {
221                        Self::parse_link(&self.endpoint, self.language, &attrs).unwrap_or_default()
222                    }
223
224                    "div" => Data::Division,
225                    _ => {
226                        warn!("unknown node '{name}'");
227                        Data::Unknown
228                    }
229                };
230
231                let index = self.push_node(data, parent, prev);
232
233                if ignore_children {
234                    return Some(index);
235                }
236
237                let mut prev = None;
238                for child in node.children.borrow().iter() {
239                    prev = self.parse_node(child, Some(index), prev)
240                }
241                Some(index)
242            }
243            NodeData::ProcessingInstruction { .. }
244            | NodeData::Doctype { .. }
245            | NodeData::Comment { .. } => prev,
246        }
247    }
248
249    fn push_node(&mut self, data: Data, parent: Option<usize>, prev: Option<usize>) -> usize {
250        let index = self.nodes.len();
251
252        self.nodes.push(Raw {
253            index,
254            parent,
255            prev,
256            next: None,
257            first_child: None,
258            last_child: None,
259            data,
260        });
261
262        if let Some(parent) = parent {
263            let parent = &mut self.nodes[parent];
264            if parent.first_child.is_none() {
265                parent.first_child = Some(index);
266            }
267            parent.last_child = Some(index);
268        }
269
270        if let Some(prev) = prev {
271            self.nodes[prev].next = Some(index);
272        }
273
274        index
275    }
276
277    fn parse_section<'a>(
278        &mut self,
279        mut attrs: impl Iterator<Item = &'a (String, String)>,
280    ) -> Option<Data> {
281        let section_id = attrs
282            .find(|(name, _)| name.as_str() == "data-mw-section-id")
283            .map(|(_, value)| value)?;
284        let section_id = usize::from_str(section_id)
285            .map_err(|err| warn!("section-id not a usize, '{err:?}'"))
286            .ok()?;
287
288        Some(Data::Section { id: section_id })
289    }
290
291    fn parse_header<'a>(
292        &mut self,
293        mut attrs: impl Iterator<Item = &'a (String, String)>,
294        kind: HeaderKind,
295    ) -> Option<Data> {
296        let header_id = attrs
297            .find(|(name, _)| name.as_str() == "id")
298            .map(|(_, value)| value.to_owned())?;
299
300        Some(Data::Header {
301            id: header_id,
302            kind,
303        })
304    }
305
306    fn parse_link(endpoint: &Url, language: Language, attrs: &[(String, String)]) -> Option<Data> {
307        let href = attrs
308            .iter()
309            .find(|(name, _)| name.as_str() == "href")
310            .map(|(_, value)| value.to_owned())?;
311
312        let title = attrs
313            .iter()
314            .find(|(name, _)| name.as_str() == "title")
315            .map(|(_, value)| value.to_owned())
316            .unwrap_or_default();
317
318        let link_url = endpoint.join(&href).ok()?;
319        let link_type: &str = match attrs
320            .iter()
321            .find(|(name, _)| name.as_str() == "rel")
322            .map(|(_, value)| value.to_owned())?
323            .as_str()
324        {
325            "mw:WikiLink" => "wiki",
326            "mw:MediaLink" => "media",
327            "mw:ExtLink" => "external",
328            _ => "",
329        };
330
331        let anchor = link_url.fragment().map(|fragment| AnchorData {
332            title: title.to_string(),
333            anchor: fragment.to_string(),
334        });
335
336        if link_type == "wiki" {
337            let namespace = Namespace::Main;
338
339            let is_same_wiki = link_url.domain() == endpoint.domain();
340            if !is_same_wiki {
341                return Some(Data::Link(Link::ExternalToInternal(
342                    ExternalToInteralData {},
343                )));
344            }
345
346            let page = link_url.path_segments()?.last()?;
347
348            const NAMESPACE_DELIMITER: char = ':';
349            let (namespace, page) =
350                if let Some((ns_str, page_str)) = page.split_once(NAMESPACE_DELIMITER) {
351                    (
352                        Namespace::from_string(ns_str).unwrap_or_else(|| {
353                            warn!("invalid namespace '{}', using default", ns_str);
354                            namespace
355                        }),
356                        page_str,
357                    )
358                } else {
359                    (namespace, page)
360                };
361
362            // we get the language from the host
363            // for wikipedia, the host looks like this
364            //      [lang].wikipedia.org/
365            // where [lang] is the language code, for example
366            //      en.wikipedia.org/
367            // for the english wikipedia
368
369            let lang_str = link_url
370                .host_str()
371                .and_then(|x| x.split_once('.').map(|x| x.0));
372
373            let language = match lang_str {
374                Some(str) => Language::from_str(str).unwrap_or(language),
375                None => language,
376            };
377
378            let link_data = InternalData {
379                namespace,
380                page: page.to_string(),
381                title,
382                endpoint: endpoint.clone(),
383                language,
384                anchor,
385            };
386
387            return Some(Data::Link(Link::Internal(link_data)));
388        }
389
390        if link_type == "media" {
391            return Some(Data::Link(Link::MediaLink(MediaData {
392                url: link_url,
393                title,
394            })));
395        }
396
397        if link_type == "external" {
398            return Some(Data::Link(Link::External(ExternalData { url: link_url })));
399        }
400
401        None
402    }
403}
404
405impl Parser for WikipediaParser {
406    fn parse_document(document: &str, endpoint: Endpoint, language: Language) -> Self {
407        let mut parser = WikipediaParser {
408            nodes: Vec::new(),
409            endpoint,
410            language,
411        };
412
413        let rc_dom = parse_document(RcDom::default(), Default::default()).one(document);
414        parser.parse_node(&rc_dom.document, None, None);
415
416        parser
417    }
418
419    fn nodes(self) -> Vec<Raw> {
420        self.nodes
421    }
422}