wiktionary_dump_parser/parser/
mod.rs

1use crate::error::Result;
2use crate::parser::words::wikitext_to_words;
3use crate::parser::xml::{read_relevant_event, RelevantEvent};
4use crate::Error;
5use async_compression::tokio::bufread::BzDecoder;
6use log::{debug, info, trace, warn};
7use quick_xml::events::attributes::Attributes;
8use quick_xml::name::QName;
9use quick_xml::Reader;
10use serde::Deserialize;
11use serde::Serialize;
12use std::ffi::OsStr;
13use std::future::Future;
14use std::io::{Read, Write};
15use std::path::Path;
16use std::pin::Pin;
17use std::task::{Context, Poll};
18use tokio::fs::File;
19use tokio::io::{
20    AsyncBufRead, AsyncRead, AsyncSeekExt, AsyncWrite, AsyncWriteExt, BufReader, BufWriter, ReadBuf,
21};
22use tokio::time::Duration;
23use tokio::time::Instant;
24use wikitext_parser::{parse_wikitext, Wikitext};
25
26use self::words::Word;
27
28pub mod words;
29mod xml;
30
31#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
32pub struct Siteinfo {
33    sitename: String,
34    dbname: String,
35    base: String,
36    generator: String,
37    case: String,
38    namespaces: Vec<Namespace>,
39}
40
41#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
42pub struct Namespace {
43    key: i64,
44    case: String,
45    name: String,
46}
47
48struct TokioReadAdapter<R>(R);
49
50impl<R: Read + Unpin> AsyncRead for TokioReadAdapter<R> {
51    fn poll_read(
52        mut self: Pin<&mut Self>,
53        _cx: &mut Context<'_>,
54        buf: &mut ReadBuf<'_>,
55    ) -> Poll<std::io::Result<()>> {
56        let amount = self.0.read(buf.initialize_unfilled());
57        Poll::Ready(amount.map(|amount| {
58            buf.advance(amount);
59        }))
60    }
61}
62
63pub async fn parse_dump_file<
64    WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
65>(
66    input_file: impl AsRef<Path>,
67    output_file: Option<impl AsRef<Path>>,
68    mut word_consumer: impl FnMut(Word) -> WordConsumerResult,
69    error_log: impl AsRef<Path>,
70    output_pretty: bool,
71) -> Result<()> {
72    let input_file = input_file.as_ref();
73    let output_file = output_file.as_ref();
74
75    if input_file.extension().map(OsStr::to_str) == Some(Some("bz2")) {
76        if input_file
77            .file_stem()
78            .map(|stem| stem.to_str().filter(|stem| stem.ends_with("xml")).is_some())
79            .is_none()
80        {
81            return Err(Error::Other(format!("Found a '.bz2' file extension that is not preceded by a '.xml' file extension in file {input_file:?}")));
82        }
83
84        debug!("Found file extension '.xml.bz2' for input file {input_file:?}");
85
86        let input_file = File::open(input_file).await?;
87        let input_size = input_file.metadata().await?.len();
88        let input_stream = BufReader::with_capacity(
89            1024 * 1024,
90            BzDecoder::new(BufReader::with_capacity(1024 * 1024, input_file)),
91        );
92
93        let output_stream = if let Some(output_file) = output_file {
94            Some(BufWriter::with_capacity(
95                1024 * 1024,
96                File::create(output_file).await?,
97            ))
98        } else {
99            None
100        };
101        let error_log = std::io::BufWriter::new(std::fs::File::create(error_log)?);
102
103        // File is compressed, so input size is not accurate
104        parse_dump_file_with_streams(
105            input_stream,
106            |input_stream| input_stream.get_mut().get_mut().get_mut(),
107            input_size,
108            output_stream,
109            &mut word_consumer,
110            error_log,
111            output_pretty,
112        )
113        .await?;
114    } else if input_file
115        .extension()
116        .filter(|extension| extension.to_str() == Some("xml"))
117        .is_some()
118    {
119        debug!("Found file extension '.xml' for input file {input_file:?}");
120
121        let input_file = File::open(input_file).await?;
122        let input_size = input_file.metadata().await?.len();
123        let input_stream = BufReader::with_capacity(1024 * 1024, input_file);
124        let output_stream = if let Some(output_file) = output_file {
125            Some(BufWriter::with_capacity(
126                1024 * 1024,
127                File::create(output_file).await?,
128            ))
129        } else {
130            None
131        };
132        let error_log = std::io::BufWriter::new(std::fs::File::create(error_log)?);
133
134        parse_dump_file_with_streams(
135            input_stream,
136            |input_stream| input_stream.get_mut(),
137            input_size,
138            output_stream,
139            &mut word_consumer,
140            error_log,
141            output_pretty,
142        )
143        .await?;
144    } else {
145        return Err(Error::Other(format!(
146            "Unknown file extension in file {input_file:?}"
147        )));
148    }
149
150    Ok(())
151}
152
153#[allow(clippy::type_complexity)]
154async fn parse_dump_file_with_streams<
155    InputStream: AsyncBufRead + Unpin,
156    WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
157>(
158    input_stream: InputStream,
159    input_stream_to_file: impl Fn(&mut InputStream) -> &mut File,
160    input_size: u64,
161    mut output_stream: Option<impl AsyncWrite + Unpin>,
162    word_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
163    mut error_log: impl Write,
164    output_pretty: bool,
165) -> Result<()> {
166    let mut reader = Reader::from_reader(input_stream);
167    let mut buffer = Vec::new();
168    let mut last_progress_log = Instant::now();
169    let mut tag_stack = Vec::new();
170    let mut json_buffer = Vec::new();
171
172    loop {
173        let current_time = Instant::now();
174        if current_time - last_progress_log >= Duration::from_secs(10) {
175            last_progress_log = current_time;
176
177            let input_file = input_stream_to_file(reader.get_mut());
178            let current = input_file.stream_position().await?;
179            let current_mib = current / (1024 * 1024);
180            let input_size_mib = input_size / (1024 * 1024);
181
182            info!("Parsing input file at {current_mib}/{input_size_mib}MiB");
183        }
184
185        let level = tag_stack.len();
186        match read_relevant_event(&mut reader, &mut buffer).await {
187            Ok(event) => match event {
188                RelevantEvent::Start(tag) => {
189                    let tag_name = String::from_utf8(tag.name().into_inner().to_vec())?;
190                    if level == 0 {
191                        if tag_name != "mediawiki" {
192                            return Err(Error::Other(format!(
193                                "Found unexpected toplevel tag {tag:?}"
194                            )));
195                        }
196                        tag_stack.push(tag_name);
197                    } else if level == 1 {
198                        match tag_name.as_str() {
199                            "siteinfo" => {
200                                let siteinfo =
201                                    parse_siteinfo(tag.attributes(), &mut reader, &mut buffer)
202                                        .await?;
203                                info!(
204                                    "{} ({} {})",
205                                    siteinfo.sitename, siteinfo.dbname, siteinfo.generator
206                                );
207                                if let Some(output_stream) = output_stream.as_mut() {
208                                    json_buffer.clear();
209                                    if output_pretty {
210                                        serde_json::to_writer_pretty(&mut json_buffer, &siteinfo)?;
211                                    } else {
212                                        serde_json::to_writer(&mut json_buffer, &siteinfo)?;
213                                    }
214                                    output_stream.write_all(&json_buffer).await?;
215                                }
216                            }
217                            "page" => {
218                                let page = parse_page(
219                                    tag.attributes(),
220                                    &mut reader,
221                                    word_consumer,
222                                    &mut buffer,
223                                    &mut error_log,
224                                )
225                                .await?;
226                                trace!("{page:?}");
227                                if let Some(output_stream) = output_stream.as_mut() {
228                                    json_buffer.clear();
229                                    if output_pretty {
230                                        serde_json::to_writer_pretty(&mut json_buffer, &page)?;
231                                    } else {
232                                        serde_json::to_writer(&mut json_buffer, &page)?;
233                                    }
234                                    output_stream.write_all(&json_buffer).await?;
235                                }
236                            }
237                            _ => {
238                                return Err(Error::Other(format!(
239                                    "Found unexpected level 1 tag {tag:?}"
240                                )))
241                            }
242                        }
243                    }
244                }
245                RelevantEvent::End(tag) => {
246                    let tag_name = String::from_utf8(tag.name().into_inner().to_vec())?;
247                    let stacked_tag = tag_stack
248                        .pop()
249                        .ok_or_else(|| Error::Other(format!("Unexpected closing tag {tag:?}")))?;
250                    if tag_name != stacked_tag {
251                        return Err(Error::Other(format!("Unexpected closing tag {tag:?}")));
252                    }
253                }
254                RelevantEvent::Empty(tag) => {
255                    return Err(Error::Other(format!("Unexpected empty tag {tag:?}")));
256                }
257                RelevantEvent::Text(text) => {
258                    return Err(Error::Other(format!("Unexpected text {text:?}")));
259                }
260                RelevantEvent::Eof => {
261                    if level > 0 {
262                        return Err(Error::Other(format!("Unexpected eof")));
263                    } else {
264                        break;
265                    }
266                }
267            },
268            Err(error) => return Err(error),
269        }
270    }
271
272    info!("Successfully parsed dump file");
273    Ok(())
274}
275
276async fn parse_siteinfo(
277    mut attributes: Attributes<'_>,
278    reader: &mut Reader<impl AsyncBufRead + Unpin>,
279    buffer: &mut Vec<u8>,
280) -> Result<Siteinfo> {
281    if let Some(attribute) = attributes.next() {
282        return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
283    }
284
285    let mut sitename = None;
286    let mut dbname = None;
287    let mut base = None;
288    let mut generator = None;
289    let mut case = None;
290    let mut namespaces = None;
291
292    loop {
293        match read_relevant_event(reader, buffer).await? {
294            RelevantEvent::Start(tag) => match tag.name().into_inner() {
295                b"sitename" => {
296                    sitename =
297                        Some(parse_string("sitename", tag.attributes(), reader, buffer).await?);
298                }
299                b"dbname" => {
300                    dbname = Some(parse_string("dbname", tag.attributes(), reader, buffer).await?);
301                }
302                b"base" => {
303                    base = Some(parse_string("base", tag.attributes(), reader, buffer).await?);
304                }
305                b"generator" => {
306                    generator =
307                        Some(parse_string("generator", tag.attributes(), reader, buffer).await?);
308                }
309                b"case" => {
310                    case = Some(parse_string("case", tag.attributes(), reader, buffer).await?);
311                }
312                b"namespaces" => {
313                    namespaces = Some(parse_namespaces(tag.attributes(), reader, buffer).await?);
314                }
315                _ => return Err(Error::Other(format!("Found unexpected tag {tag:?}"))),
316            },
317            RelevantEvent::End(tag) => {
318                return if tag.name() == QName(b"siteinfo") {
319                    Ok(Siteinfo {
320                        sitename: if let Some(sitename) = sitename {
321                            sitename
322                        } else {
323                            return Err(Error::Other(format!("Missing sitename in siteinfo")));
324                        },
325                        dbname: if let Some(dbname) = dbname {
326                            dbname
327                        } else {
328                            return Err(Error::Other(format!("Missing dbname in siteinfo")));
329                        },
330                        base: if let Some(base) = base {
331                            base
332                        } else {
333                            return Err(Error::Other(format!("Missing base in siteinfo")));
334                        },
335                        generator: if let Some(generator) = generator {
336                            generator
337                        } else {
338                            return Err(Error::Other(format!("Missing generator in siteinfo")));
339                        },
340                        case: if let Some(case) = case {
341                            case
342                        } else {
343                            return Err(Error::Other(format!("Missing case in siteinfo")));
344                        },
345                        namespaces: if let Some(namespaces) = namespaces {
346                            namespaces
347                        } else {
348                            return Err(Error::Other(format!("Missing namespaces in siteinfo")));
349                        },
350                    })
351                } else {
352                    Err(Error::Other(format!(
353                        "Found unexpected closing tag {tag:?}"
354                    )))
355                };
356            }
357            RelevantEvent::Empty(tag) => {
358                warn!("{tag:?}")
359            }
360            RelevantEvent::Text(text) => {
361                warn!("{text:?}")
362            }
363            RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
364        }
365    }
366}
367
368async fn parse_namespaces(
369    mut attributes: Attributes<'_>,
370    reader: &mut Reader<impl AsyncBufRead + Unpin>,
371    buffer: &mut Vec<u8>,
372) -> Result<Vec<Namespace>> {
373    if let Some(attribute) = attributes.next() {
374        return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
375    }
376
377    struct NamespaceTag {
378        key: i64,
379        case: String,
380    }
381    let mut current_namespace_tag = None;
382    let mut namespaces = Vec::new();
383
384    loop {
385        match read_relevant_event(reader, buffer).await? {
386            RelevantEvent::Start(tag) => {
387                if tag.name() == QName(b"namespace") {
388                    if current_namespace_tag.is_some() {
389                        return Err(Error::Other(format!("Found nested namespace tag {tag:?}")));
390                    }
391
392                    current_namespace_tag = Some(NamespaceTag {
393                        key: String::from_utf8_lossy(
394                            &tag.try_get_attribute(b"key")?
395                                .ok_or_else(|| {
396                                    Error::Other(format!("Missing attribute key in {tag:?}"))
397                                })?
398                                .value,
399                        )
400                        .parse()
401                        .map_err(|_| Error::Other(format!("Key is not an integer in {tag:?}")))?,
402                        case: String::from_utf8_lossy(
403                            &tag.try_get_attribute(b"case")?
404                                .ok_or_else(|| {
405                                    Error::Other(format!("Missing attribute case in {tag:?}"))
406                                })?
407                                .value,
408                        )
409                        .into_owned(),
410                    });
411                } else {
412                    return Err(Error::Other(format!("Found unexpected tag {tag:?}")));
413                }
414            }
415            RelevantEvent::End(tag) => {
416                if tag.name() == QName(b"namespaces") {
417                    return Ok(namespaces);
418                } else if tag.name() == QName(b"namespace") {
419                    if current_namespace_tag.is_some() {
420                        return Err(Error::Other(format!(
421                            "Found namespace tag without text {tag:?}"
422                        )));
423                    }
424                } else {
425                    return Err(Error::Other(format!(
426                        "Found unexpected closing tag {tag:?}"
427                    )));
428                };
429            }
430            RelevantEvent::Empty(tag) => {
431                match tag.name().into_inner() {
432                    b"namespace" => { /* ignore nameless namespace */ }
433                    _ => warn!("{tag:?}"),
434                }
435            }
436            RelevantEvent::Text(text) => {
437                if let Some(current_namespace_tag) = current_namespace_tag {
438                    namespaces.push(Namespace {
439                        key: current_namespace_tag.key,
440                        case: current_namespace_tag.case,
441                        name: text,
442                    });
443                } else {
444                    return Err(Error::Other(format!(
445                        "Found text outside of namespace tag: {text:?}"
446                    )));
447                }
448
449                current_namespace_tag = None;
450            }
451            RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
452        }
453    }
454}
455
456#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
457pub struct Page {
458    title: String,
459    namespace: i64,
460    id: i64,
461    revision: Revision,
462    redirect: Option<String>,
463}
464
465async fn parse_page<
466    WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
467>(
468    mut attributes: Attributes<'_>,
469    reader: &mut Reader<impl AsyncBufRead + Unpin>,
470    word_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
471    buffer: &mut Vec<u8>,
472    error_log: &mut impl Write,
473) -> Result<Page> {
474    if let Some(attribute) = attributes.next() {
475        return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
476    }
477
478    let mut title = None;
479    let mut namespace = None;
480    let mut id = None;
481    let mut revision = None;
482    let mut redirect = None;
483
484    loop {
485        match read_relevant_event(reader, buffer).await? {
486            RelevantEvent::Start(tag) => match tag.name().into_inner() {
487                b"title" => {
488                    title = Some(parse_string("title", tag.attributes(), reader, buffer).await?);
489                }
490                b"ns" => {
491                    namespace = Some(
492                        parse_string("ns", tag.attributes(), reader, buffer)
493                            .await?
494                            .parse()
495                            .map_err(|_| {
496                                Error::Other(format!("ns is not an integer in {tag:?}"))
497                            })?,
498                    );
499                }
500                b"id" => {
501                    id = Some(
502                        parse_string("id", tag.attributes(), reader, buffer)
503                            .await?
504                            .parse()
505                            .map_err(|_| {
506                                Error::Other(format!("id is not an integer in {tag:?}"))
507                            })?,
508                    );
509                }
510                b"revision" => {
511                    revision = Some(
512                        parse_revision(
513                            tag.attributes(),
514                            title.clone(),
515                            reader,
516                            word_consumer,
517                            buffer,
518                            error_log,
519                        )
520                        .await?,
521                    );
522                }
523                _ => return Err(Error::Other(format!("Found unexpected tag {tag:?}"))),
524            },
525            RelevantEvent::End(tag) => {
526                return if tag.name() == QName(b"page") {
527                    Ok(Page {
528                        title: if let Some(title) = title {
529                            title
530                        } else {
531                            return Err(Error::Other(format!("Missing title in page")));
532                        },
533                        namespace: if let Some(namespace) = namespace {
534                            namespace
535                        } else {
536                            return Err(Error::Other(format!("Missing namespace in page")));
537                        },
538                        id: if let Some(id) = id {
539                            id
540                        } else {
541                            return Err(Error::Other(format!("Missing id in page")));
542                        },
543                        revision: if let Some(revision) = revision {
544                            revision
545                        } else {
546                            return Err(Error::Other(format!("Missing revision in page")));
547                        },
548                        redirect,
549                    })
550                } else {
551                    Err(Error::Other(format!(
552                        "Found unexpected closing tag {tag:?}"
553                    )))
554                };
555            }
556            RelevantEvent::Empty(tag) => match tag.name().into_inner() {
557                b"redirect" => {
558                    for attribute in tag.attributes() {
559                        let attribute = attribute?;
560                        match attribute.key {
561                            QName(b"title") => {
562                                redirect = Some(String::from_utf8(attribute.value.to_vec())?);
563                            }
564                            _ => warn!("{tag:?} {attribute:?}"),
565                        }
566                    }
567                }
568                _ => warn!("{tag:?}"),
569            },
570            RelevantEvent::Text(text) => {
571                warn!("{text:?}")
572            }
573            RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
574        }
575    }
576}
577
578#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
579pub struct Revision {
580    id: i64,
581    parentid: Option<i64>,
582    timestamp: String,
583    contributor: Option<Contributor>,
584    comment: Option<String>,
585    model: String,
586    format: String,
587    text: Option<Text>,
588    sha1: String,
589    minor: bool,
590}
591
592async fn parse_revision<
593    WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
594>(
595    mut attributes: Attributes<'_>,
596    title: Option<String>,
597    reader: &mut Reader<impl AsyncBufRead + Unpin>,
598    word_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
599    buffer: &mut Vec<u8>,
600    error_log: &mut impl Write,
601) -> Result<Revision> {
602    if let Some(attribute) = attributes.next() {
603        return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
604    }
605
606    let mut id = None;
607    let mut parentid = None;
608    let mut timestamp = None;
609    let mut contributor = None;
610    let mut comment = None;
611    let mut model = None;
612    let mut format = None;
613    let mut text = None;
614    let mut sha1 = None;
615    let mut minor = false;
616
617    loop {
618        match read_relevant_event(reader, buffer).await? {
619            RelevantEvent::Start(tag) => match tag.name().into_inner() {
620                b"id" => {
621                    id = Some(
622                        parse_string("id", tag.attributes(), reader, buffer)
623                            .await?
624                            .parse()
625                            .map_err(|_| {
626                                Error::Other(format!("id is not an integer in {tag:?}"))
627                            })?,
628                    );
629                }
630                b"parentid" => {
631                    parentid = Some(
632                        parse_string("parentid", tag.attributes(), reader, buffer)
633                            .await?
634                            .parse()
635                            .map_err(|_| {
636                                Error::Other(format!("parentid is not an integer in {tag:?}"))
637                            })?,
638                    );
639                }
640                b"timestamp" => {
641                    timestamp =
642                        Some(parse_string("timestamp", tag.attributes(), reader, buffer).await?);
643                }
644                b"contributor" => {
645                    contributor = Some(parse_contributor(tag.attributes(), reader, buffer).await?);
646                }
647                b"comment" => {
648                    comment =
649                        Some(parse_string("comment", tag.attributes(), reader, buffer).await?);
650                }
651                b"model" => {
652                    model = Some(parse_string("model", tag.attributes(), reader, buffer).await?);
653                }
654                b"format" => {
655                    format = Some(parse_string("format", tag.attributes(), reader, buffer).await?);
656                }
657                b"text" => {
658                    text = Some(
659                        parse_text(
660                            tag.attributes(),
661                            title.as_deref(),
662                            reader,
663                            word_consumer,
664                            buffer,
665                            error_log,
666                        )
667                        .await?,
668                    );
669                }
670                b"sha1" => {
671                    sha1 = Some(parse_string("sha1", tag.attributes(), reader, buffer).await?);
672                }
673                _ => return Err(Error::Other(format!("Found unexpected tag {tag:?}"))),
674            },
675            RelevantEvent::End(tag) => {
676                return if tag.name() == QName(b"revision") {
677                    if text.is_none() {
678                        debug!("No text for revision with id {id:?} and comment {comment:?}");
679                    }
680
681                    Ok(Revision {
682                        id: if let Some(id) = id {
683                            id
684                        } else {
685                            return Err(Error::Other(format!("Missing id in revision")));
686                        },
687                        parentid,
688                        timestamp: if let Some(timestamp) = timestamp {
689                            timestamp
690                        } else {
691                            return Err(Error::Other(format!("Missing timestamp in revision")));
692                        },
693                        contributor,
694                        comment,
695                        model: if let Some(model) = model {
696                            model
697                        } else {
698                            return Err(Error::Other(format!("Missing model in revision")));
699                        },
700                        format: if let Some(format) = format {
701                            format
702                        } else {
703                            return Err(Error::Other(format!("Missing format in revision")));
704                        },
705                        text,
706                        sha1: if let Some(sha1) = sha1 {
707                            sha1
708                        } else {
709                            return Err(Error::Other(format!("Missing sha1 in revision")));
710                        },
711                        minor,
712                    })
713                } else {
714                    Err(Error::Other(format!(
715                        "Found unexpected closing tag {tag:?}"
716                    )))
717                };
718            }
719            RelevantEvent::Empty(tag) => {
720                match tag.name().into_inner() {
721                    b"minor" => {
722                        minor = true;
723                    }
724                    b"comment" => { /* ignore empty comment */ }
725                    b"text" => { /* ignore empty text */ }
726                    b"contributor" => { /* ignore empty contributor */ }
727                    _ => warn!("{tag:?}"),
728                }
729            }
730            RelevantEvent::Text(text) => {
731                warn!("{text:?}")
732            }
733            RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
734        }
735    }
736}
737
738#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
739pub enum Contributor {
740    User { username: String, id: i64 },
741    Anonymous { ip: String },
742}
743
744async fn parse_contributor(
745    mut attributes: Attributes<'_>,
746    reader: &mut Reader<impl AsyncBufRead + Unpin>,
747    buffer: &mut Vec<u8>,
748) -> Result<Contributor> {
749    if let Some(attribute) = attributes.next() {
750        return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
751    }
752
753    let mut username = None;
754    let mut id: Option<i64> = None;
755    let mut ip = None;
756
757    loop {
758        match read_relevant_event(reader, buffer).await? {
759            RelevantEvent::Start(tag) => match tag.name().into_inner() {
760                b"username" => {
761                    username =
762                        Some(parse_string("username", tag.attributes(), reader, buffer).await?);
763                }
764                b"id" => {
765                    id = Some(
766                        parse_string("id", tag.attributes(), reader, buffer)
767                            .await?
768                            .parse()
769                            .map_err(|_| {
770                                Error::Other(format!("id is not an integer in {tag:?}"))
771                            })?,
772                    );
773                }
774                b"ip" => {
775                    ip = Some(parse_string("ip", tag.attributes(), reader, buffer).await?);
776                }
777                _ => return Err(Error::Other(format!("Found unexpected tag {tag:?}"))),
778            },
779            RelevantEvent::End(tag) => {
780                return if tag.name() == QName(b"contributor") {
781                    if let (Some(username), Some(id), None) = (&username, &id, &ip) {
782                        Ok(Contributor::User {
783                            username: username.clone(),
784                            id: *id,
785                        })
786                    } else if let (None, None, Some(ip)) = (&username, &id, &ip) {
787                        Ok(Contributor::Anonymous { ip: ip.clone() })
788                    } else {
789                        Err(Error::Other(format!("Unknown combination of fields for contributor: {username:?}, {id:?}, {ip:?}")))
790                    }
791                } else {
792                    Err(Error::Other(format!(
793                        "Found unexpected closing tag {tag:?}"
794                    )))
795                };
796            }
797            RelevantEvent::Empty(tag) => {
798                warn!("{tag:?}")
799            }
800            RelevantEvent::Text(text) => {
801                warn!("{text:?}")
802            }
803            RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
804        }
805    }
806}
807
808#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
809pub struct Text {
810    xml_space: XmlSpace,
811    text: Wikitext,
812}
813
814#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
815pub enum XmlSpace {
816    Preserve,
817}
818
819async fn parse_text<
820    WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
821>(
822    attributes: Attributes<'_>,
823    title: Option<&str>,
824    reader: &mut Reader<impl AsyncBufRead + Unpin>,
825    mut word_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
826    buffer: &mut Vec<u8>,
827    error_log: &mut impl Write,
828) -> Result<Text> {
829    let mut bytes: Option<usize> = None;
830    let mut xml_space = None;
831
832    for attribute in attributes {
833        let attribute = attribute?;
834        match attribute.key.into_inner() {
835            b"bytes" => {
836                bytes = Some(
837                    String::from_utf8(attribute.value.to_vec())?
838                        .parse()
839                        .map_err(|_| {
840                            Error::Other(format!("bytes is not an integer in {attribute:?}"))
841                        })?,
842                );
843            }
844            b"xml:space" => {
845                xml_space = Some(match attribute.value.as_ref() {
846                    b"preserve" => XmlSpace::Preserve,
847                    _ => {
848                        return Err(Error::Other(format!(
849                            "Found unexpected attribute value {attribute:?}"
850                        )))
851                    }
852                });
853            }
854            _ => {
855                return Err(Error::Other(format!(
856                    "Found unexpected attribute {attribute:?}"
857                )))
858            }
859        }
860    }
861
862    let mut text = None;
863
864    loop {
865        match read_relevant_event(reader, buffer).await? {
866            RelevantEvent::Start(tag) => {
867                return Err(Error::Other(format!("Found unexpected tag {tag:?}")));
868            }
869            RelevantEvent::End(tag) => {
870                return if tag.name() == QName(b"text") {
871                    Ok(Text {
872                        xml_space: if let Some(xml_space) = xml_space {
873                            xml_space
874                        } else {
875                            return Err(Error::Other(format!("Missing tag xml:space in text")));
876                        },
877                        text: if let Some(text) = text {
878                            text
879                        } else {
880                            return Err(Error::Other(format!("Missing text in text")));
881                        },
882                    })
883                } else {
884                    Err(Error::Other(format!(
885                        "Found unexpected closing tag {tag:?}"
886                    )))
887                };
888            }
889            RelevantEvent::Empty(tag) => {
890                warn!("{tag:?}")
891            }
892            RelevantEvent::Text(raw_text) => {
893                if let Some(bytes) = bytes {
894                    let raw_text_len = raw_text.len();
895                    if raw_text_len != bytes {
896                        warn!("Text length mismatch, attribute states {bytes}, but we got {raw_text_len}");
897                    }
898                }
899                assert!(text.is_none());
900                if title.is_none() {
901                    warn!("Page content is parsed before its title.");
902                }
903
904                debug!("Parsing '{}'", title.unwrap_or("<unknown>"));
905                let mut parser_errors = Vec::new();
906                let parsed_text = parse_wikitext(
907                    &raw_text,
908                    title.map(ToString::to_string).unwrap_or_default(),
909                    |error| parser_errors.push(error),
910                );
911
912                let page_name = title.map(ToString::to_string).unwrap_or_default();
913
914                let mut word_errors = Vec::new();
915                wikitext_to_words(&page_name, &parsed_text, &mut word_consumer, |error| {
916                    word_errors.push(error)
917                })
918                .await?;
919
920                if !parser_errors.is_empty() || !word_errors.is_empty() {
921                    debug!("Page '{page_name}' has {} errors", parser_errors.len());
922                    writeln!(error_log, "Page: {page_name}")
923                        .unwrap_or_else(|error| panic!("Writing to error log failed: {error}"));
924                }
925                for error in &parser_errors {
926                    writeln!(error_log, "{error:#?}")
927                        .unwrap_or_else(|error| panic!("Writing to error log failed: {error}"));
928                }
929                for error in &word_errors {
930                    writeln!(error_log, "{error:#?}")
931                        .unwrap_or_else(|error| panic!("Writing to error log failed: {error}"));
932                }
933                if !parser_errors.is_empty() || !word_errors.is_empty() {
934                    writeln!(error_log, "\nContent: {raw_text}\n")
935                        .unwrap_or_else(|error| panic!("Writing to error log failed: {error}"));
936                }
937
938                text = Some(parsed_text);
939            }
940            RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
941        }
942    }
943}
944
945async fn parse_string(
946    name: impl AsRef<[u8]>,
947    mut attributes: Attributes<'_>,
948    reader: &mut Reader<impl AsyncBufRead + Unpin>,
949    buffer: &mut Vec<u8>,
950) -> Result<String> {
951    let name = name.as_ref();
952    if let Some(attribute) = attributes.next() {
953        return Err(Error::Other(format!("Unexpected attribute {attribute:?}")));
954    }
955
956    let mut value = String::new();
957
958    loop {
959        match read_relevant_event(reader, buffer).await? {
960            RelevantEvent::Start(tag) => {
961                return Err(Error::Other(format!("Found unexpected tag {tag:?}")));
962            }
963            RelevantEvent::End(tag) => {
964                return if tag.name() == QName(name) {
965                    Ok(value)
966                } else {
967                    Err(Error::Other(format!(
968                        "Found unexpected closing tag {tag:?}"
969                    )))
970                };
971            }
972            RelevantEvent::Empty(tag) => {
973                warn!("{tag:?}")
974            }
975            RelevantEvent::Text(text) => value = text,
976            RelevantEvent::Eof => return Err(Error::Other(format!("Unexpected eof"))),
977        }
978    }
979}