1use crate::error::Result;
2use crate::parser::words::wikitext_to_words;
3use crate::parser::xml::{read_relevant_event, RelevantEvent};
4use crate::Error;
5use async_compression::tokio::bufread::BzDecoder;
6use log::{debug, info, trace, warn};
7use quick_xml::events::attributes::Attributes;
8use quick_xml::name::QName;
9use quick_xml::Reader;
10use serde::Deserialize;
11use serde::Serialize;
12use std::ffi::OsStr;
13use std::future::Future;
14use std::io::{Read, Write};
15use std::path::Path;
16use std::pin::Pin;
17use std::task::{Context, Poll};
18use tokio::fs::File;
19use tokio::io::{
20 AsyncBufRead, AsyncRead, AsyncSeekExt, AsyncWrite, AsyncWriteExt, BufReader, BufWriter, ReadBuf,
21};
22use tokio::time::Duration;
23use tokio::time::Instant;
24use wikitext_parser::{parse_wikitext, Wikitext};
25
26use self::words::Word;
27
28pub mod words;
29mod xml;
30
/// Site-level metadata collected by `parse_siteinfo` from the children of a
/// `<siteinfo>` element. All fields are required; parsing fails if one is missing.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct Siteinfo {
    /// Text content of the `<sitename>` element.
    sitename: String,
    /// Text content of the `<dbname>` element.
    dbname: String,
    /// Text content of the `<base>` element.
    base: String,
    /// Text content of the `<generator>` element.
    generator: String,
    /// Text content of the `<case>` element.
    case: String,
    /// Entries parsed from the `<namespaces>` element.
    namespaces: Vec<Namespace>,
}
40
/// A single `<namespace>` entry of the `<namespaces>` list in the siteinfo.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct Namespace {
    /// Value of the `key` attribute, parsed as an integer.
    key: i64,
    /// Value of the `case` attribute.
    case: String,
    /// Text content of the `<namespace>` element.
    name: String,
}
47
48struct TokioReadAdapter<R>(R);
49
50impl<R: Read + Unpin> AsyncRead for TokioReadAdapter<R> {
51 fn poll_read(
52 mut self: Pin<&mut Self>,
53 _cx: &mut Context<'_>,
54 buf: &mut ReadBuf<'_>,
55 ) -> Poll<std::io::Result<()>> {
56 let amount = self.0.read(buf.initialize_unfilled());
57 Poll::Ready(amount.map(|amount| {
58 buf.advance(amount);
59 }))
60 }
61}
62
63pub async fn parse_dump_file<
64 WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
65>(
66 input_file: impl AsRef<Path>,
67 output_file: Option<impl AsRef<Path>>,
68 mut word_consumer: impl FnMut(Word) -> WordConsumerResult,
69 error_log: impl AsRef<Path>,
70 output_pretty: bool,
71) -> Result<()> {
72 let input_file = input_file.as_ref();
73 let output_file = output_file.as_ref();
74
75 if input_file.extension().map(OsStr::to_str) == Some(Some("bz2")) {
76 if input_file
77 .file_stem()
78 .map(|stem| stem.to_str().filter(|stem| stem.ends_with("xml")).is_some())
79 .is_none()
80 {
81 return Err(Error::Other(format!("Found a '.bz2' file extension that is not preceded by a '.xml' file extension in file {input_file:?}")));
82 }
83
84 debug!("Found file extension '.xml.bz2' for input file {input_file:?}");
85
86 let input_file = File::open(input_file).await?;
87 let input_size = input_file.metadata().await?.len();
88 let input_stream = BufReader::with_capacity(
89 1024 * 1024,
90 BzDecoder::new(BufReader::with_capacity(1024 * 1024, input_file)),
91 );
92
93 let output_stream = if let Some(output_file) = output_file {
94 Some(BufWriter::with_capacity(
95 1024 * 1024,
96 File::create(output_file).await?,
97 ))
98 } else {
99 None
100 };
101 let error_log = std::io::BufWriter::new(std::fs::File::create(error_log)?);
102
103 parse_dump_file_with_streams(
105 input_stream,
106 |input_stream| input_stream.get_mut().get_mut().get_mut(),
107 input_size,
108 output_stream,
109 &mut word_consumer,
110 error_log,
111 output_pretty,
112 )
113 .await?;
114 } else if input_file
115 .extension()
116 .filter(|extension| extension.to_str() == Some("xml"))
117 .is_some()
118 {
119 debug!("Found file extension '.xml' for input file {input_file:?}");
120
121 let input_file = File::open(input_file).await?;
122 let input_size = input_file.metadata().await?.len();
123 let input_stream = BufReader::with_capacity(1024 * 1024, input_file);
124 let output_stream = if let Some(output_file) = output_file {
125 Some(BufWriter::with_capacity(
126 1024 * 1024,
127 File::create(output_file).await?,
128 ))
129 } else {
130 None
131 };
132 let error_log = std::io::BufWriter::new(std::fs::File::create(error_log)?);
133
134 parse_dump_file_with_streams(
135 input_stream,
136 |input_stream| input_stream.get_mut(),
137 input_size,
138 output_stream,
139 &mut word_consumer,
140 error_log,
141 output_pretty,
142 )
143 .await?;
144 } else {
145 return Err(Error::Other(format!(
146 "Unknown file extension in file {input_file:?}"
147 )));
148 }
149
150 Ok(())
151}
152
153#[allow(clippy::type_complexity)]
154async fn parse_dump_file_with_streams<
155 InputStream: AsyncBufRead + Unpin,
156 WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
157>(
158 input_stream: InputStream,
159 input_stream_to_file: impl Fn(&mut InputStream) -> &mut File,
160 input_size: u64,
161 mut output_stream: Option<impl AsyncWrite + Unpin>,
162 word_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
163 mut error_log: impl Write,
164 output_pretty: bool,
165) -> Result<()> {
166 let mut reader = Reader::from_reader(input_stream);
167 let mut buffer = Vec::new();
168 let mut last_progress_log = Instant::now();
169 let mut tag_stack = Vec::new();
170 let mut json_buffer = Vec::new();
171
172 loop {
173 let current_time = Instant::now();
174 if current_time - last_progress_log >= Duration::from_secs(10) {
175 last_progress_log = current_time;
176
177 let input_file = input_stream_to_file(reader.get_mut());
178 let current = input_file.stream_position().await?;
179 let current_mib = current / (1024 * 1024);
180 let input_size_mib = input_size / (1024 * 1024);
181
182 info!("Parsing input file at {current_mib}/{input_size_mib}MiB");
183 }
184
185 let level = tag_stack.len();
186 match read_relevant_event(&mut reader, &mut buffer).await {
187 Ok(event) => match event {
188 RelevantEvent::Start(tag) => {
189 let tag_name = String::from_utf8(tag.name().into_inner().to_vec())?;
190 if level == 0 {
191 if tag_name != "mediawiki" {
192 return Err(Error::Other(format!(
193 "Found unexpected toplevel tag {tag:?}"
194 )));
195 }
196 tag_stack.push(tag_name);
197 } else if level == 1 {
198 match tag_name.as_str() {
199 "siteinfo" => {
200 let siteinfo =
201 parse_siteinfo(tag.attributes(), &mut reader, &mut buffer)
202 .await?;
203 info!(
204 "{} ({} {})",
205 siteinfo.sitename, siteinfo.dbname, siteinfo.generator
206 );
207 if let Some(output_stream) = output_stream.as_mut() {
208 json_buffer.clear();
209 if output_pretty {
210 serde_json::to_writer_pretty(&mut json_buffer, &siteinfo)?;
211 } else {
212 serde_json::to_writer(&mut json_buffer, &siteinfo)?;
213 }
214 output_stream.write_all(&json_buffer).await?;
215 }
216 }
217 "page" => {
218 let page = parse_page(
219 tag.attributes(),
220 &mut reader,
221 word_consumer,
222 &mut buffer,
223 &mut error_log,
224 )
225 .await?;
226 trace!("{page:?}");
227 if let Some(output_stream) = output_stream.as_mut() {
228 json_buffer.clear();
229 if output_pretty {
230 serde_json::to_writer_pretty(&mut json_buffer, &page)?;
231 } else {
232 serde_json::to_writer(&mut json_buffer, &page)?;
233 }
234 output_stream.write_all(&json_buffer).await?;
235 }
236 }
237 _ => {
238 return Err(Error::Other(format!(
239 "Found unexpected level 1 tag {tag:?}"
240 )))
241 }
242 }
243 }
244 }
245 RelevantEvent::End(tag) => {
246 let tag_name = String::from_utf8(tag.name().into_inner().to_vec())?;
247 let stacked_tag = tag_stack
248 .pop()
249 .ok_or_else(|| Error::Other(format!("Unexpected closing tag {tag:?}")))?;
250 if tag_name != stacked_tag {
251 return Err(Error::Other(format!("Unexpected closing tag {tag:?}")));
252 }
253 }
254 RelevantEvent::Empty(tag) => {
255 return Err(Error::Other(format!("Unexpected empty tag {tag:?}")));
256 }
257 RelevantEvent::Text(text) => {
258 return Err(Error::Other(format!("Unexpected text {text:?}")));
259 }
260 RelevantEvent::Eof => {
261 if level > 0 {
262 return Err(Error::Other(format!("Unexpected eof")));
263 } else {
264 break;
265 }
266 }
267 },
268 Err(error) => return Err(error),
269 }
270 }
271
272 info!("Successfully parsed dump file");
273 Ok(())
274}
275
276async fn parse_siteinfo(
277 mut attributes: Attributes<'_>,
278 reader: &mut Reader<impl AsyncBufRead + Unpin>,
279 buffer: &mut Vec<u8>,
280) -> Result<Siteinfo> {
281 if let Some(attribute) = attributes.next() {
282 return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
283 }
284
285 let mut sitename = None;
286 let mut dbname = None;
287 let mut base = None;
288 let mut generator = None;
289 let mut case = None;
290 let mut namespaces = None;
291
292 loop {
293 match read_relevant_event(reader, buffer).await? {
294 RelevantEvent::Start(tag) => match tag.name().into_inner() {
295 b"sitename" => {
296 sitename =
297 Some(parse_string("sitename", tag.attributes(), reader, buffer).await?);
298 }
299 b"dbname" => {
300 dbname = Some(parse_string("dbname", tag.attributes(), reader, buffer).await?);
301 }
302 b"base" => {
303 base = Some(parse_string("base", tag.attributes(), reader, buffer).await?);
304 }
305 b"generator" => {
306 generator =
307 Some(parse_string("generator", tag.attributes(), reader, buffer).await?);
308 }
309 b"case" => {
310 case = Some(parse_string("case", tag.attributes(), reader, buffer).await?);
311 }
312 b"namespaces" => {
313 namespaces = Some(parse_namespaces(tag.attributes(), reader, buffer).await?);
314 }
315 _ => return Err(Error::Other(format!("Found unexpected tag {tag:?}"))),
316 },
317 RelevantEvent::End(tag) => {
318 return if tag.name() == QName(b"siteinfo") {
319 Ok(Siteinfo {
320 sitename: if let Some(sitename) = sitename {
321 sitename
322 } else {
323 return Err(Error::Other(format!("Missing sitename in siteinfo")));
324 },
325 dbname: if let Some(dbname) = dbname {
326 dbname
327 } else {
328 return Err(Error::Other(format!("Missing dbname in siteinfo")));
329 },
330 base: if let Some(base) = base {
331 base
332 } else {
333 return Err(Error::Other(format!("Missing base in siteinfo")));
334 },
335 generator: if let Some(generator) = generator {
336 generator
337 } else {
338 return Err(Error::Other(format!("Missing generator in siteinfo")));
339 },
340 case: if let Some(case) = case {
341 case
342 } else {
343 return Err(Error::Other(format!("Missing case in siteinfo")));
344 },
345 namespaces: if let Some(namespaces) = namespaces {
346 namespaces
347 } else {
348 return Err(Error::Other(format!("Missing namespaces in siteinfo")));
349 },
350 })
351 } else {
352 Err(Error::Other(format!(
353 "Found unexpected closing tag {tag:?}"
354 )))
355 };
356 }
357 RelevantEvent::Empty(tag) => {
358 warn!("{tag:?}")
359 }
360 RelevantEvent::Text(text) => {
361 warn!("{text:?}")
362 }
363 RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
364 }
365 }
366}
367
368async fn parse_namespaces(
369 mut attributes: Attributes<'_>,
370 reader: &mut Reader<impl AsyncBufRead + Unpin>,
371 buffer: &mut Vec<u8>,
372) -> Result<Vec<Namespace>> {
373 if let Some(attribute) = attributes.next() {
374 return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
375 }
376
377 struct NamespaceTag {
378 key: i64,
379 case: String,
380 }
381 let mut current_namespace_tag = None;
382 let mut namespaces = Vec::new();
383
384 loop {
385 match read_relevant_event(reader, buffer).await? {
386 RelevantEvent::Start(tag) => {
387 if tag.name() == QName(b"namespace") {
388 if current_namespace_tag.is_some() {
389 return Err(Error::Other(format!("Found nested namespace tag {tag:?}")));
390 }
391
392 current_namespace_tag = Some(NamespaceTag {
393 key: String::from_utf8_lossy(
394 &tag.try_get_attribute(b"key")?
395 .ok_or_else(|| {
396 Error::Other(format!("Missing attribute key in {tag:?}"))
397 })?
398 .value,
399 )
400 .parse()
401 .map_err(|_| Error::Other(format!("Key is not an integer in {tag:?}")))?,
402 case: String::from_utf8_lossy(
403 &tag.try_get_attribute(b"case")?
404 .ok_or_else(|| {
405 Error::Other(format!("Missing attribute case in {tag:?}"))
406 })?
407 .value,
408 )
409 .into_owned(),
410 });
411 } else {
412 return Err(Error::Other(format!("Found unexpected tag {tag:?}")));
413 }
414 }
415 RelevantEvent::End(tag) => {
416 if tag.name() == QName(b"namespaces") {
417 return Ok(namespaces);
418 } else if tag.name() == QName(b"namespace") {
419 if current_namespace_tag.is_some() {
420 return Err(Error::Other(format!(
421 "Found namespace tag without text {tag:?}"
422 )));
423 }
424 } else {
425 return Err(Error::Other(format!(
426 "Found unexpected closing tag {tag:?}"
427 )));
428 };
429 }
430 RelevantEvent::Empty(tag) => {
431 match tag.name().into_inner() {
432 b"namespace" => { }
433 _ => warn!("{tag:?}"),
434 }
435 }
436 RelevantEvent::Text(text) => {
437 if let Some(current_namespace_tag) = current_namespace_tag {
438 namespaces.push(Namespace {
439 key: current_namespace_tag.key,
440 case: current_namespace_tag.case,
441 name: text,
442 });
443 } else {
444 return Err(Error::Other(format!(
445 "Found text outside of namespace tag: {text:?}"
446 )));
447 }
448
449 current_namespace_tag = None;
450 }
451 RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
452 }
453 }
454}
455
/// A `<page>` element of the dump, as assembled by `parse_page`.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct Page {
    /// Text content of the `<title>` element.
    title: String,
    /// Content of the `<ns>` element, parsed as an integer.
    namespace: i64,
    /// Content of the `<id>` element, parsed as an integer.
    id: i64,
    /// The parsed `<revision>` element; if several occur, the last one is kept.
    revision: Revision,
    /// Value of the `title` attribute of a self-closing `<redirect>` element,
    /// or `None` if no such attribute was seen.
    redirect: Option<String>,
}
464
465async fn parse_page<
466 WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
467>(
468 mut attributes: Attributes<'_>,
469 reader: &mut Reader<impl AsyncBufRead + Unpin>,
470 word_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
471 buffer: &mut Vec<u8>,
472 error_log: &mut impl Write,
473) -> Result<Page> {
474 if let Some(attribute) = attributes.next() {
475 return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
476 }
477
478 let mut title = None;
479 let mut namespace = None;
480 let mut id = None;
481 let mut revision = None;
482 let mut redirect = None;
483
484 loop {
485 match read_relevant_event(reader, buffer).await? {
486 RelevantEvent::Start(tag) => match tag.name().into_inner() {
487 b"title" => {
488 title = Some(parse_string("title", tag.attributes(), reader, buffer).await?);
489 }
490 b"ns" => {
491 namespace = Some(
492 parse_string("ns", tag.attributes(), reader, buffer)
493 .await?
494 .parse()
495 .map_err(|_| {
496 Error::Other(format!("ns is not an integer in {tag:?}"))
497 })?,
498 );
499 }
500 b"id" => {
501 id = Some(
502 parse_string("id", tag.attributes(), reader, buffer)
503 .await?
504 .parse()
505 .map_err(|_| {
506 Error::Other(format!("id is not an integer in {tag:?}"))
507 })?,
508 );
509 }
510 b"revision" => {
511 revision = Some(
512 parse_revision(
513 tag.attributes(),
514 title.clone(),
515 reader,
516 word_consumer,
517 buffer,
518 error_log,
519 )
520 .await?,
521 );
522 }
523 _ => return Err(Error::Other(format!("Found unexpected tag {tag:?}"))),
524 },
525 RelevantEvent::End(tag) => {
526 return if tag.name() == QName(b"page") {
527 Ok(Page {
528 title: if let Some(title) = title {
529 title
530 } else {
531 return Err(Error::Other(format!("Missing title in page")));
532 },
533 namespace: if let Some(namespace) = namespace {
534 namespace
535 } else {
536 return Err(Error::Other(format!("Missing namespace in page")));
537 },
538 id: if let Some(id) = id {
539 id
540 } else {
541 return Err(Error::Other(format!("Missing id in page")));
542 },
543 revision: if let Some(revision) = revision {
544 revision
545 } else {
546 return Err(Error::Other(format!("Missing revision in page")));
547 },
548 redirect,
549 })
550 } else {
551 Err(Error::Other(format!(
552 "Found unexpected closing tag {tag:?}"
553 )))
554 };
555 }
556 RelevantEvent::Empty(tag) => match tag.name().into_inner() {
557 b"redirect" => {
558 for attribute in tag.attributes() {
559 let attribute = attribute?;
560 match attribute.key {
561 QName(b"title") => {
562 redirect = Some(String::from_utf8(attribute.value.to_vec())?);
563 }
564 _ => warn!("{tag:?} {attribute:?}"),
565 }
566 }
567 }
568 _ => warn!("{tag:?}"),
569 },
570 RelevantEvent::Text(text) => {
571 warn!("{text:?}")
572 }
573 RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
574 }
575 }
576}
577
/// A `<revision>` element of a page, as assembled by `parse_revision`.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct Revision {
    /// Content of the `<id>` element, parsed as an integer.
    id: i64,
    /// Content of the `<parentid>` element, if present.
    parentid: Option<i64>,
    /// Text content of the `<timestamp>` element (kept as a string).
    timestamp: String,
    /// Parsed `<contributor>` element; `None` if absent or self-closing.
    contributor: Option<Contributor>,
    /// Text content of the `<comment>` element; `None` if absent or self-closing.
    comment: Option<String>,
    /// Text content of the `<model>` element.
    model: String,
    /// Text content of the `<format>` element.
    format: String,
    /// Parsed `<text>` element; `None` if absent or self-closing.
    text: Option<Text>,
    /// Text content of the `<sha1>` element.
    sha1: String,
    /// `true` if a self-closing `<minor>` element was present.
    minor: bool,
}
591
592async fn parse_revision<
593 WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
594>(
595 mut attributes: Attributes<'_>,
596 title: Option<String>,
597 reader: &mut Reader<impl AsyncBufRead + Unpin>,
598 word_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
599 buffer: &mut Vec<u8>,
600 error_log: &mut impl Write,
601) -> Result<Revision> {
602 if let Some(attribute) = attributes.next() {
603 return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
604 }
605
606 let mut id = None;
607 let mut parentid = None;
608 let mut timestamp = None;
609 let mut contributor = None;
610 let mut comment = None;
611 let mut model = None;
612 let mut format = None;
613 let mut text = None;
614 let mut sha1 = None;
615 let mut minor = false;
616
617 loop {
618 match read_relevant_event(reader, buffer).await? {
619 RelevantEvent::Start(tag) => match tag.name().into_inner() {
620 b"id" => {
621 id = Some(
622 parse_string("id", tag.attributes(), reader, buffer)
623 .await?
624 .parse()
625 .map_err(|_| {
626 Error::Other(format!("id is not an integer in {tag:?}"))
627 })?,
628 );
629 }
630 b"parentid" => {
631 parentid = Some(
632 parse_string("parentid", tag.attributes(), reader, buffer)
633 .await?
634 .parse()
635 .map_err(|_| {
636 Error::Other(format!("parentid is not an integer in {tag:?}"))
637 })?,
638 );
639 }
640 b"timestamp" => {
641 timestamp =
642 Some(parse_string("timestamp", tag.attributes(), reader, buffer).await?);
643 }
644 b"contributor" => {
645 contributor = Some(parse_contributor(tag.attributes(), reader, buffer).await?);
646 }
647 b"comment" => {
648 comment =
649 Some(parse_string("comment", tag.attributes(), reader, buffer).await?);
650 }
651 b"model" => {
652 model = Some(parse_string("model", tag.attributes(), reader, buffer).await?);
653 }
654 b"format" => {
655 format = Some(parse_string("format", tag.attributes(), reader, buffer).await?);
656 }
657 b"text" => {
658 text = Some(
659 parse_text(
660 tag.attributes(),
661 title.as_deref(),
662 reader,
663 word_consumer,
664 buffer,
665 error_log,
666 )
667 .await?,
668 );
669 }
670 b"sha1" => {
671 sha1 = Some(parse_string("sha1", tag.attributes(), reader, buffer).await?);
672 }
673 _ => return Err(Error::Other(format!("Found unexpected tag {tag:?}"))),
674 },
675 RelevantEvent::End(tag) => {
676 return if tag.name() == QName(b"revision") {
677 if text.is_none() {
678 debug!("No text for revision with id {id:?} and comment {comment:?}");
679 }
680
681 Ok(Revision {
682 id: if let Some(id) = id {
683 id
684 } else {
685 return Err(Error::Other(format!("Missing id in revision")));
686 },
687 parentid,
688 timestamp: if let Some(timestamp) = timestamp {
689 timestamp
690 } else {
691 return Err(Error::Other(format!("Missing timestamp in revision")));
692 },
693 contributor,
694 comment,
695 model: if let Some(model) = model {
696 model
697 } else {
698 return Err(Error::Other(format!("Missing model in revision")));
699 },
700 format: if let Some(format) = format {
701 format
702 } else {
703 return Err(Error::Other(format!("Missing format in revision")));
704 },
705 text,
706 sha1: if let Some(sha1) = sha1 {
707 sha1
708 } else {
709 return Err(Error::Other(format!("Missing sha1 in revision")));
710 },
711 minor,
712 })
713 } else {
714 Err(Error::Other(format!(
715 "Found unexpected closing tag {tag:?}"
716 )))
717 };
718 }
719 RelevantEvent::Empty(tag) => {
720 match tag.name().into_inner() {
721 b"minor" => {
722 minor = true;
723 }
724 b"comment" => { }
725 b"text" => { }
726 b"contributor" => { }
727 _ => warn!("{tag:?}"),
728 }
729 }
730 RelevantEvent::Text(text) => {
731 warn!("{text:?}")
732 }
733 RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
734 }
735 }
736}
737
/// Author of a revision, as parsed from a `<contributor>` element.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub enum Contributor {
    /// Produced when `<username>` and `<id>` are present and `<ip>` is not.
    User { username: String, id: i64 },
    /// Produced when only `<ip>` is present.
    Anonymous { ip: String },
}
743
744async fn parse_contributor(
745 mut attributes: Attributes<'_>,
746 reader: &mut Reader<impl AsyncBufRead + Unpin>,
747 buffer: &mut Vec<u8>,
748) -> Result<Contributor> {
749 if let Some(attribute) = attributes.next() {
750 return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
751 }
752
753 let mut username = None;
754 let mut id: Option<i64> = None;
755 let mut ip = None;
756
757 loop {
758 match read_relevant_event(reader, buffer).await? {
759 RelevantEvent::Start(tag) => match tag.name().into_inner() {
760 b"username" => {
761 username =
762 Some(parse_string("username", tag.attributes(), reader, buffer).await?);
763 }
764 b"id" => {
765 id = Some(
766 parse_string("id", tag.attributes(), reader, buffer)
767 .await?
768 .parse()
769 .map_err(|_| {
770 Error::Other(format!("id is not an integer in {tag:?}"))
771 })?,
772 );
773 }
774 b"ip" => {
775 ip = Some(parse_string("ip", tag.attributes(), reader, buffer).await?);
776 }
777 _ => return Err(Error::Other(format!("Found unexpected tag {tag:?}"))),
778 },
779 RelevantEvent::End(tag) => {
780 return if tag.name() == QName(b"contributor") {
781 if let (Some(username), Some(id), None) = (&username, &id, &ip) {
782 Ok(Contributor::User {
783 username: username.clone(),
784 id: *id,
785 })
786 } else if let (None, None, Some(ip)) = (&username, &id, &ip) {
787 Ok(Contributor::Anonymous { ip: ip.clone() })
788 } else {
789 Err(Error::Other(format!("Unknown combination of fields for contributor: {username:?}, {id:?}, {ip:?}")))
790 }
791 } else {
792 Err(Error::Other(format!(
793 "Found unexpected closing tag {tag:?}"
794 )))
795 };
796 }
797 RelevantEvent::Empty(tag) => {
798 warn!("{tag:?}")
799 }
800 RelevantEvent::Text(text) => {
801 warn!("{text:?}")
802 }
803 RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
804 }
805 }
806}
807
/// A `<text>` element of a revision, as assembled by `parse_text`.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct Text {
    /// Value of the `xml:space` attribute of the `<text>` tag.
    xml_space: XmlSpace,
    /// The element's text content after parsing it with `parse_wikitext`.
    text: Wikitext,
}
813
/// Accepted values of the `xml:space` attribute on a `<text>` tag.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub enum XmlSpace {
    /// Attribute value `preserve`; any other value is rejected by `parse_text`.
    Preserve,
}
818
819async fn parse_text<
820 WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
821>(
822 attributes: Attributes<'_>,
823 title: Option<&str>,
824 reader: &mut Reader<impl AsyncBufRead + Unpin>,
825 mut word_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
826 buffer: &mut Vec<u8>,
827 error_log: &mut impl Write,
828) -> Result<Text> {
829 let mut bytes: Option<usize> = None;
830 let mut xml_space = None;
831
832 for attribute in attributes {
833 let attribute = attribute?;
834 match attribute.key.into_inner() {
835 b"bytes" => {
836 bytes = Some(
837 String::from_utf8(attribute.value.to_vec())?
838 .parse()
839 .map_err(|_| {
840 Error::Other(format!("bytes is not an integer in {attribute:?}"))
841 })?,
842 );
843 }
844 b"xml:space" => {
845 xml_space = Some(match attribute.value.as_ref() {
846 b"preserve" => XmlSpace::Preserve,
847 _ => {
848 return Err(Error::Other(format!(
849 "Found unexpected attribute value {attribute:?}"
850 )))
851 }
852 });
853 }
854 _ => {
855 return Err(Error::Other(format!(
856 "Found unexpected attribute {attribute:?}"
857 )))
858 }
859 }
860 }
861
862 let mut text = None;
863
864 loop {
865 match read_relevant_event(reader, buffer).await? {
866 RelevantEvent::Start(tag) => {
867 return Err(Error::Other(format!("Found unexpected tag {tag:?}")));
868 }
869 RelevantEvent::End(tag) => {
870 return if tag.name() == QName(b"text") {
871 Ok(Text {
872 xml_space: if let Some(xml_space) = xml_space {
873 xml_space
874 } else {
875 return Err(Error::Other(format!("Missing tag xml:space in text")));
876 },
877 text: if let Some(text) = text {
878 text
879 } else {
880 return Err(Error::Other(format!("Missing text in text")));
881 },
882 })
883 } else {
884 Err(Error::Other(format!(
885 "Found unexpected closing tag {tag:?}"
886 )))
887 };
888 }
889 RelevantEvent::Empty(tag) => {
890 warn!("{tag:?}")
891 }
892 RelevantEvent::Text(raw_text) => {
893 if let Some(bytes) = bytes {
894 let raw_text_len = raw_text.len();
895 if raw_text_len != bytes {
896 warn!("Text length mismatch, attribute states {bytes}, but we got {raw_text_len}");
897 }
898 }
899 assert!(text.is_none());
900 if title.is_none() {
901 warn!("Page content is parsed before its title.");
902 }
903
904 debug!("Parsing '{}'", title.unwrap_or("<unknown>"));
905 let mut parser_errors = Vec::new();
906 let parsed_text = parse_wikitext(
907 &raw_text,
908 title.map(ToString::to_string).unwrap_or_default(),
909 |error| parser_errors.push(error),
910 );
911
912 let page_name = title.map(ToString::to_string).unwrap_or_default();
913
914 let mut word_errors = Vec::new();
915 wikitext_to_words(&page_name, &parsed_text, &mut word_consumer, |error| {
916 word_errors.push(error)
917 })
918 .await?;
919
920 if !parser_errors.is_empty() || !word_errors.is_empty() {
921 debug!("Page '{page_name}' has {} errors", parser_errors.len());
922 writeln!(error_log, "Page: {page_name}")
923 .unwrap_or_else(|error| panic!("Writing to error log failed: {error}"));
924 }
925 for error in &parser_errors {
926 writeln!(error_log, "{error:#?}")
927 .unwrap_or_else(|error| panic!("Writing to error log failed: {error}"));
928 }
929 for error in &word_errors {
930 writeln!(error_log, "{error:#?}")
931 .unwrap_or_else(|error| panic!("Writing to error log failed: {error}"));
932 }
933 if !parser_errors.is_empty() || !word_errors.is_empty() {
934 writeln!(error_log, "\nContent: {raw_text}\n")
935 .unwrap_or_else(|error| panic!("Writing to error log failed: {error}"));
936 }
937
938 text = Some(parsed_text);
939 }
940 RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
941 }
942 }
943}
944
945async fn parse_string(
946 name: impl AsRef<[u8]>,
947 mut attributes: Attributes<'_>,
948 reader: &mut Reader<impl AsyncBufRead + Unpin>,
949 buffer: &mut Vec<u8>,
950) -> Result<String> {
951 let name = name.as_ref();
952 if let Some(attribute) = attributes.next() {
953 return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
954 }
955
956 let mut value = String::new();
957
958 loop {
959 match read_relevant_event(reader, buffer).await? {
960 RelevantEvent::Start(tag) => {
961 return Err(Error::Other(format!("Found unexpected tag {tag:?}")));
962 }
963 RelevantEvent::End(tag) => {
964 return if tag.name() == QName(name) {
965 Ok(value)
966 } else {
967 Err(Error::Other(format!(
968 "Found unexpected closing tag {tag:?}"
969 )))
970 };
971 }
972 RelevantEvent::Empty(tag) => {
973 warn!("{tag:?}")
974 }
975 RelevantEvent::Text(text) => value = text,
976 RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
977 }
978 }
979}