1use html5ever::{parse_document, tendril::TendrilSink};
2use markup5ever_rcdom::{Handle, NodeData, RcDom};
3use std::str::FromStr;
4use tracing::{trace, warn};
5use url::Url;
6
7use crate::{
8 document::{Data, HeaderKind, Raw, UnsupportedElement},
9 languages::Language,
10 page::{
11 link_data::{AnchorData, ExternalData, ExternalToInteralData, InternalData, MediaData},
12 Link,
13 },
14 search::Namespace,
15 Endpoint,
16};
17
18pub trait Parser {
20 fn parse_document(document: &str, endpoint: Endpoint, language: Language) -> Self;
21 fn nodes(self) -> Vec<Raw>;
22}
23
24pub struct WikipediaParser {
25 nodes: Vec<Raw>,
26 endpoint: Endpoint,
27 language: Language,
28}
29
30impl WikipediaParser {
31 fn parse_node(
32 &mut self,
33 node: &Handle,
34 parent: Option<usize>,
35 prev: Option<usize>,
36 ) -> Option<usize> {
37 match node.data {
38 NodeData::Document => {
39 let mut prev = None;
40 for child in node.children.borrow().iter() {
41 prev = self.parse_node(child, parent, prev)
42 }
43 None
44 }
45 NodeData::Text { ref contents } => {
46 let data = Data::Text {
47 contents: contents.borrow().to_string(),
48 };
49 Some(self.push_node(data, parent, prev))
50 }
51 NodeData::Element {
52 ref name,
53 ref attrs,
54 ..
55 } => {
56 let name = name.local.to_string();
57 let attrs: Vec<(String, String)> = attrs
58 .borrow()
59 .iter()
60 .map(|attr| (attr.name.local.to_string(), attr.value.to_string()))
61 .collect();
62
63 let mut ignore_children = false;
64
65 let data = match name.as_str() {
66 "head" | "style" | "link" => return prev,
67
68 "table" => {
69 ignore_children = true;
70 Data::Unsupported(UnsupportedElement::Table)
71 }
72 "image" => {
73 ignore_children = true;
74 Data::Unsupported(UnsupportedElement::Image)
75 }
76 "figure" => {
77 ignore_children = true;
78 Data::Unsupported(UnsupportedElement::Figure)
79 }
80 "pre" => {
81 ignore_children = true;
82 Data::Unsupported(UnsupportedElement::PreformattedText)
83 }
84
85 "span"
86 if attrs.iter().any(|(name, value)| {
87 name.as_str() == "class"
88 && (value.contains("texhtml") || value.contains("mwe-math-element"))
89 }) =>
90 {
91 ignore_children = true;
92 Data::UnsupportedInline(UnsupportedElement::MathElement)
93 }
94
95 "ul" if attrs.iter().any(|(name, value)| {
96 name.as_str() == "class" && value.contains("portalbox")
97 }) =>
98 {
99 trace!("ignoring 'ul' class: 'portalbox'");
100 return prev;
101 }
102
103 "div"
104 if attrs.iter().any(|(name, value)| {
105 name.as_str() == "class"
106 && (value.contains("toc") || value.contains("quotebox"))
107 }) =>
108 {
109 trace!("ignoring 'div': class: 'toc' || 'quotebox'");
110 return prev;
111 }
112
113 "div"
114 if attrs.iter().any(|(name, value)| {
115 name.as_str() == "class" && value.contains("mw-empty-elt")
116 }) =>
117 {
118 trace!("ignoring 'div': class: 'mw-empty-elt'");
119 return prev;
120 }
121
122 "span"
123 if attrs.iter().any(|(name, value)| {
124 name.as_str() == "class" && value.contains("cs1-maint")
125 }) =>
126 {
127 trace!("ignoring 'span': class: 'cs1-maint'");
128 return prev;
129 }
130
131 _ if attrs.iter().any(|(name, value)| {
132 name.as_str() == "class" && value.contains("noprint")
133 }) =>
134 {
135 trace!("ignoring '{name}': class: 'noprint'");
136 return prev;
137 }
138
139 "span"
140 if attrs.iter().any(|(name, value)| {
141 name.as_str() == "class" && value.contains("mw-editsection")
142 }) =>
143 {
144 trace!("ignoring 'span': class: 'mw-editsection'");
145 return prev;
146 }
147
148 "span"
149 if attrs.iter().any(|(name, value)| {
150 name.as_str() == "typeof" && value.contains("mw:Nowiki")
151 }) =>
152 {
153 trace!("ignoring 'span': class: 'mw:Nowiki'");
154 return prev;
155 }
156
157 "span"
158 if attrs.iter().any(|(name, value)| {
159 name.as_str() == "class" && value.contains("mw-reflink-text")
160 }) =>
161 {
162 Data::Reflink
163 }
164
165 "section" => self.parse_section(attrs.iter()).unwrap_or_default(),
166 "h1" => self
167 .parse_header(attrs.iter(), HeaderKind::Main)
168 .unwrap_or_default(),
169
170 "h2" => self
171 .parse_header(attrs.iter(), HeaderKind::Sub)
172 .unwrap_or_default(),
173 "h3" => self
174 .parse_header(attrs.iter(), HeaderKind::Section)
175 .unwrap_or_default(),
176 "h4" => self
177 .parse_header(attrs.iter(), HeaderKind::Subsection)
178 .unwrap_or_default(),
179 "h5" => self
180 .parse_header(attrs.iter(), HeaderKind::Minor)
181 .unwrap_or_default(),
182 "h6" => self
183 .parse_header(attrs.iter(), HeaderKind::Detail)
184 .unwrap_or_default(),
185
186 "blockquote" => Data::Blockquote,
187
188 "ol" => Data::OrderedList,
189 "ul" => Data::UnorderedList,
190 "li" => Data::ListItem,
191
192 "dl" => Data::DescriptionList,
193 "dt" => Data::DescriptionListTerm,
194 "dd" => Data::DerscriptionListDescription,
195
196 "br" => Data::Linebreak,
197
198 "b" => Data::Bold,
199 "i" => Data::Italic,
200
201 "p" => Data::Paragraph,
202 "span" => Data::Span,
203
204 "div"
205 if attrs.iter().any(|(name, value)| {
206 name.as_str() == "class" && value.contains("redirectMsg")
207 }) =>
208 {
209 Data::RedirectMessage
210 }
211
212 "div"
213 if attrs.iter().any(|(name, value)| {
214 name.as_str() == "class" && value.contains("hatnote")
215 }) =>
216 {
217 Data::Disambiguation
218 }
219
220 "a" => {
221 Self::parse_link(&self.endpoint, self.language, &attrs).unwrap_or_default()
222 }
223
224 "div" => Data::Division,
225 _ => {
226 warn!("unknown node '{name}'");
227 Data::Unknown
228 }
229 };
230
231 let index = self.push_node(data, parent, prev);
232
233 if ignore_children {
234 return Some(index);
235 }
236
237 let mut prev = None;
238 for child in node.children.borrow().iter() {
239 prev = self.parse_node(child, Some(index), prev)
240 }
241 Some(index)
242 }
243 NodeData::ProcessingInstruction { .. }
244 | NodeData::Doctype { .. }
245 | NodeData::Comment { .. } => prev,
246 }
247 }
248
249 fn push_node(&mut self, data: Data, parent: Option<usize>, prev: Option<usize>) -> usize {
250 let index = self.nodes.len();
251
252 self.nodes.push(Raw {
253 index,
254 parent,
255 prev,
256 next: None,
257 first_child: None,
258 last_child: None,
259 data,
260 });
261
262 if let Some(parent) = parent {
263 let parent = &mut self.nodes[parent];
264 if parent.first_child.is_none() {
265 parent.first_child = Some(index);
266 }
267 parent.last_child = Some(index);
268 }
269
270 if let Some(prev) = prev {
271 self.nodes[prev].next = Some(index);
272 }
273
274 index
275 }
276
277 fn parse_section<'a>(
278 &mut self,
279 mut attrs: impl Iterator<Item = &'a (String, String)>,
280 ) -> Option<Data> {
281 let section_id = attrs
282 .find(|(name, _)| name.as_str() == "data-mw-section-id")
283 .map(|(_, value)| value)?;
284 let section_id = usize::from_str(section_id)
285 .map_err(|err| warn!("section-id not a usize, '{err:?}'"))
286 .ok()?;
287
288 Some(Data::Section { id: section_id })
289 }
290
291 fn parse_header<'a>(
292 &mut self,
293 mut attrs: impl Iterator<Item = &'a (String, String)>,
294 kind: HeaderKind,
295 ) -> Option<Data> {
296 let header_id = attrs
297 .find(|(name, _)| name.as_str() == "id")
298 .map(|(_, value)| value.to_owned())?;
299
300 Some(Data::Header {
301 id: header_id,
302 kind,
303 })
304 }
305
306 fn parse_link(endpoint: &Url, language: Language, attrs: &[(String, String)]) -> Option<Data> {
307 let href = attrs
308 .iter()
309 .find(|(name, _)| name.as_str() == "href")
310 .map(|(_, value)| value.to_owned())?;
311
312 let title = attrs
313 .iter()
314 .find(|(name, _)| name.as_str() == "title")
315 .map(|(_, value)| value.to_owned())
316 .unwrap_or_default();
317
318 let link_url = endpoint.join(&href).ok()?;
319 let link_type: &str = match attrs
320 .iter()
321 .find(|(name, _)| name.as_str() == "rel")
322 .map(|(_, value)| value.to_owned())?
323 .as_str()
324 {
325 "mw:WikiLink" => "wiki",
326 "mw:MediaLink" => "media",
327 "mw:ExtLink" => "external",
328 _ => "",
329 };
330
331 let anchor = link_url.fragment().map(|fragment| AnchorData {
332 title: title.to_string(),
333 anchor: fragment.to_string(),
334 });
335
336 if link_type == "wiki" {
337 let namespace = Namespace::Main;
338
339 let is_same_wiki = link_url.domain() == endpoint.domain();
340 if !is_same_wiki {
341 return Some(Data::Link(Link::ExternalToInternal(
342 ExternalToInteralData {},
343 )));
344 }
345
346 let page = link_url.path_segments()?.last()?;
347
348 const NAMESPACE_DELIMITER: char = ':';
349 let (namespace, page) =
350 if let Some((ns_str, page_str)) = page.split_once(NAMESPACE_DELIMITER) {
351 (
352 Namespace::from_string(ns_str).unwrap_or_else(|| {
353 warn!("invalid namespace '{}', using default", ns_str);
354 namespace
355 }),
356 page_str,
357 )
358 } else {
359 (namespace, page)
360 };
361
362 let lang_str = link_url
370 .host_str()
371 .and_then(|x| x.split_once('.').map(|x| x.0));
372
373 let language = match lang_str {
374 Some(str) => Language::from_str(str).unwrap_or(language),
375 None => language,
376 };
377
378 let link_data = InternalData {
379 namespace,
380 page: page.to_string(),
381 title,
382 endpoint: endpoint.clone(),
383 language,
384 anchor,
385 };
386
387 return Some(Data::Link(Link::Internal(link_data)));
388 }
389
390 if link_type == "media" {
391 return Some(Data::Link(Link::MediaLink(MediaData {
392 url: link_url,
393 title,
394 })));
395 }
396
397 if link_type == "external" {
398 return Some(Data::Link(Link::External(ExternalData { url: link_url })));
399 }
400
401 None
402 }
403}
404
405impl Parser for WikipediaParser {
406 fn parse_document(document: &str, endpoint: Endpoint, language: Language) -> Self {
407 let mut parser = WikipediaParser {
408 nodes: Vec::new(),
409 endpoint,
410 language,
411 };
412
413 let rc_dom = parse_document(RcDom::default(), Default::default()).one(document);
414 parser.parse_node(&rc_dom.document, None, None);
415
416 parser
417 }
418
419 fn nodes(self) -> Vec<Raw> {
420 self.nodes
421 }
422}