use std::collections::HashSet;
use std::io::{BufRead, BufReader};
use std::path::Path;
use quick_xml::Reader;
use quick_xml::events::Event;
use crate::error::Error;
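/// Child elements of `<page>` whose text content is extracted while streaming.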
#[derive(Clone, Copy, PartialEq, Eq)]
enum PageTag {
Title,
Ns,
Id,
Text,
Other,
}
impl PageTag {
#[inline]
fn from_bytes(b: &[u8]) -> Self {
match b {
b"title" => PageTag::Title,
b"ns" => PageTag::Ns,
b"id" => PageTag::Id,
b"text" => PageTag::Text,
_ => PageTag::Other,
}
}
#[inline]
fn as_str(self) -> &'static str {
match self {
PageTag::Title => "title",
PageTag::Ns => "ns",
PageTag::Id => "id",
PageTag::Text => "text",
PageTag::Other => "unknown",
}
}
}
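/// A single article parsed from one `<page>` element of a MediaWiki XML dump.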
#[derive(Debug, Clone)]
pub struct Article {
pub id: u64,
pub title: String,
pub namespace: i32,
pub text: String,
}
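/// Streaming reader over the `<page>` elements of a MediaWiki XML dump.
///
/// Implements [`Iterator`] with `Item = Result<Article, Error>`, yielding one
/// [`Article`] per page whose namespace is in the configured set. The wiki's
/// base URL is available from `url_base()` once the `<siteinfo>` header has
/// been read (i.e. after the first page has been requested).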
pub struct DumpReader<R: BufRead> {
reader: Reader<R>,
buf: Vec<u8>,
namespaces: HashSet<i32>,
url_base: String,
    initialized: bool,
    /// Set when `ensure_initialized` consumed a `<page>` start tag (a dump with
    /// no `<siteinfo>` header) that the iterator still needs to parse.
    pending_page: bool,
}
impl<R: BufRead> DumpReader<R> {
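    /// Creates a reader that yields only pages whose `<ns>` value is listed in `namespaces`.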
pub fn new(reader: R, namespaces: &[i32]) -> Self {
let mut xml_reader = Reader::from_reader(reader);
xml_reader.config_mut().trim_text(true);
Self {
reader: xml_reader,
buf: Vec::new(),
namespaces: namespaces.iter().copied().collect(),
url_base: String::new(),
            initialized: false,
            pending_page: false,
}
}
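    /// Base URL of the wiki, taken from `<siteinfo><base>` with its final path
    /// segment stripped (e.g. `https://en.wikipedia.org/wiki`). Empty until the
    /// dump header has been parsed.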
pub fn url_base(&self) -> &str {
&self.url_base
}
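    /// Consumes events up to `</siteinfo>`, capturing the `<base>` URL along the way.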
fn parse_siteinfo(&mut self) -> Result<(), Error> {
let mut in_base = false;
loop {
self.buf.clear();
match self.reader.read_event_into(&mut self.buf) {
Ok(Event::Start(ref e)) => {
if e.local_name().as_ref() == b"base" {
in_base = true;
}
}
Ok(Event::Text(ref e)) if in_base => {
let base_text = e
.decode()
.map_err(|err| Error::Xml(format!("failed to decode base text: {err}")))?
.to_string();
if let Some(pos) = base_text.rfind('/') {
self.url_base = base_text[..pos].to_string();
} else {
self.url_base = base_text;
}
}
Ok(Event::End(ref e)) => {
if e.local_name().as_ref() == b"base" {
in_base = false;
} else if e.local_name().as_ref() == b"siteinfo" {
return Ok(());
}
}
Ok(Event::Eof) => {
return Err(Error::Xml(
"unexpected EOF while parsing siteinfo".to_string(),
));
}
Err(e) => return Err(Error::XmlReader(e)),
_ => {}
}
}
}
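    /// On first use, skips past the dump header and parses `<siteinfo>` if present.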
fn ensure_initialized(&mut self) -> Result<(), Error> {
if self.initialized {
return Ok(());
}
self.initialized = true;
loop {
self.buf.clear();
match self.reader.read_event_into(&mut self.buf) {
Ok(Event::Start(ref e)) => {
let name = e.local_name();
if name.as_ref() == b"siteinfo" {
self.parse_siteinfo()?;
return Ok(());
}
if name.as_ref() == b"page" {
return Ok(());
}
}
Ok(Event::Eof) => return Ok(()),
Err(e) => return Err(Error::XmlReader(e)),
_ => {}
}
}
}
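    /// Parses the body of a `<page>` element (its start tag has already been
    /// consumed). Returns `Ok(None)` when the page is filtered out by namespace
    /// or is missing a page-level `<id>`.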
fn parse_page(&mut self) -> Result<Option<Article>, Error> {
let mut title = String::new();
let mut id: Option<u64> = None;
let mut ns: Option<i32> = None;
let mut text = String::new();
let mut current_tag = PageTag::Other;
let mut in_revision = false;
let mut page_id_captured = false;
loop {
self.buf.clear();
match self.reader.read_event_into(&mut self.buf) {
Ok(Event::Start(ref e)) => {
let local = e.local_name();
let raw = local.as_ref();
current_tag = PageTag::from_bytes(raw);
if raw == b"revision" {
in_revision = true;
}
}
Ok(Event::End(ref e)) => {
let local = e.local_name();
let tag = local.as_ref();
if tag == b"revision" {
in_revision = false;
} else if tag == b"page" {
let namespace = ns.unwrap_or(0);
if !self.namespaces.contains(&namespace) {
return Ok(None);
}
let page_id = match id {
Some(v) => v,
None => {
log::warn!("page missing <id>, skipping");
return Ok(None);
}
};
return Ok(Some(Article {
id: page_id,
title,
namespace,
text,
}));
}
current_tag = PageTag::Other;
}
Ok(Event::Text(ref e)) => {
let value = e
.decode()
.map_err(|err| {
Error::Xml(format!(
"failed to decode text in <{}>: {err}",
current_tag.as_str()
))
})?
.into_owned();
match current_tag {
PageTag::Title => title = value,
PageTag::Ns => {
ns = Some(value.parse::<i32>().map_err(|err| {
Error::Xml(format!("invalid namespace value '{value}': {err}"))
})?);
}
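                        // Only the page-level <id> is captured; revision and
                        // contributor ids (inside <revision>) are ignored.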
PageTag::Id if !in_revision && !page_id_captured => {
id = Some(value.parse::<u64>().map_err(|err| {
Error::Xml(format!("invalid page id '{value}': {err}"))
})?);
page_id_captured = true;
}
PageTag::Text if in_revision => {
text = value;
}
_ => {}
}
}
Ok(Event::CData(ref e)) => {
if current_tag == PageTag::Text && in_revision {
text = String::from_utf8_lossy(e.as_ref()).into_owned();
}
}
Ok(Event::Eof) => {
return Err(Error::Xml("unexpected EOF inside <page>".to_string()));
}
Err(e) => return Err(Error::XmlReader(e)),
_ => {}
}
}
}
}
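// Per-page parse errors are logged and skipped so that one malformed page does
// not end iteration; errors from the XML reader between pages are returned.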
impl<R: BufRead> Iterator for DumpReader<R> {
type Item = Result<Article, Error>;
fn next(&mut self) -> Option<Self::Item> {
if let Err(e) = self.ensure_initialized() {
return Some(Err(e));
}
        loop {
            // A <page> start tag may already have been consumed while reading
            // the dump header (see `ensure_initialized`); otherwise scan ahead
            // to the next one.
            let at_page = if self.pending_page {
                self.pending_page = false;
                true
            } else {
                self.buf.clear();
                match self.reader.read_event_into(&mut self.buf) {
                    Ok(Event::Start(ref e)) => e.local_name().as_ref() == b"page",
                    Ok(Event::Eof) => return None,
                    Err(e) => return Some(Err(Error::XmlReader(e))),
                    _ => false,
                }
            };
            if !at_page {
                continue;
            }
            match self.parse_page() {
                Ok(Some(article)) => return Some(Ok(article)),
                Ok(None) => continue,
                Err(e) => {
                    log::warn!("error parsing page: {e}");
                    continue;
                }
            }
        }
}
}
const BUF_READER_CAPACITY: usize = 256 * 1024;
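/// Opens a MediaWiki XML dump at `path`, transparently decompressing `.bz2`
/// files (including multistream dumps), and returns a [`DumpReader`] restricted
/// to the given namespaces.
///
/// A minimal usage sketch (the file name is illustrative):
///
/// ```ignore
/// let dump = open_dump(Path::new("enwiki-latest-pages-articles.xml.bz2"), &[0])?;
/// for article in dump {
///     let article = article?;
///     println!("{}: {} bytes of wikitext", article.title, article.text.len());
/// }
/// ```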
pub fn open_dump(path: &Path, namespaces: &[i32]) -> Result<DumpReader<Box<dyn BufRead>>, Error> {
let file = std::fs::File::open(path)?;
let buf_reader = BufReader::with_capacity(BUF_READER_CAPACITY, file);
let reader: Box<dyn BufRead> = if path
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("bz2"))
{
Box::new(BufReader::with_capacity(
BUF_READER_CAPACITY,
bzip2::bufread::MultiBzDecoder::new(buf_reader),
))
} else {
Box::new(buf_reader)
};
Ok(DumpReader::new(reader, namespaces))
}
#[cfg(test)]
mod tests {
use super::*;
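    /// Builds a `DumpReader` over an in-memory XML document for tests.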
fn reader_from_xml(xml: &str, namespaces: &[i32]) -> DumpReader<Box<dyn BufRead>> {
let cursor = std::io::Cursor::new(xml.to_string().into_bytes());
let boxed: Box<dyn BufRead> = Box::new(BufReader::new(cursor));
DumpReader::new(boxed, namespaces)
}
const SINGLE_PAGE_XML: &str = r#"<mediawiki>
<siteinfo>
<sitename>Wikipedia</sitename>
<base>https://en.wikipedia.org/wiki/Main_Page</base>
</siteinfo>
<page>
<title>Test Article</title>
<ns>0</ns>
<id>42</id>
<revision>
<id>999</id>
<text>Hello, world!</text>
</revision>
</page>
</mediawiki>"#;
#[test]
fn test_parse_single_page() {
let mut reader = reader_from_xml(SINGLE_PAGE_XML, &[0]);
let article = reader.next().unwrap().unwrap();
assert_eq!(article.id, 42);
assert_eq!(article.title, "Test Article");
assert_eq!(article.namespace, 0);
assert_eq!(article.text, "Hello, world!");
assert!(reader.next().is_none());
}
#[test]
fn test_namespace_filtering() {
let xml = r#"<mediawiki>
<siteinfo>
<sitename>Wikipedia</sitename>
<base>https://en.wikipedia.org/wiki/Main_Page</base>
</siteinfo>
<page>
<title>Main Article</title>
<ns>0</ns>
<id>1</id>
<revision>
<id>100</id>
<text>Main content</text>
</revision>
</page>
<page>
<title>Talk:Main Article</title>
<ns>1</ns>
<id>2</id>
<revision>
<id>101</id>
<text>Talk content</text>
</revision>
</page>
<page>
<title>Another Article</title>
<ns>0</ns>
<id>3</id>
<revision>
<id>102</id>
<text>Another content</text>
</revision>
</page>
</mediawiki>"#;
let reader = reader_from_xml(xml, &[0]);
let articles: Vec<Article> = reader.map(|r| r.unwrap()).collect();
assert_eq!(articles.len(), 2);
assert_eq!(articles[0].id, 1);
assert_eq!(articles[1].id, 3);
}
#[test]
fn test_url_base_extraction() {
let mut reader = reader_from_xml(SINGLE_PAGE_XML, &[0]);
let _ = reader.next();
assert_eq!(reader.url_base(), "https://en.wikipedia.org/wiki");
}
#[test]
fn test_multiple_pages() {
let xml = r#"<mediawiki>
<siteinfo>
<sitename>Wikipedia</sitename>
<base>https://en.wikipedia.org/wiki/Main_Page</base>
</siteinfo>
<page>
<title>First</title>
<ns>0</ns>
<id>10</id>
<revision>
<id>200</id>
<text>First text</text>
</revision>
</page>
<page>
<title>Second</title>
<ns>0</ns>
<id>20</id>
<revision>
<id>201</id>
<text>Second text</text>
</revision>
</page>
<page>
<title>Third</title>
<ns>0</ns>
<id>30</id>
<revision>
<id>202</id>
<text>Third text</text>
</revision>
</page>
</mediawiki>"#;
let reader = reader_from_xml(xml, &[0]);
let articles: Vec<Article> = reader.map(|r| r.unwrap()).collect();
assert_eq!(articles.len(), 3);
assert_eq!(articles[0].id, 10);
assert_eq!(articles[0].title, "First");
assert_eq!(articles[1].id, 20);
assert_eq!(articles[1].title, "Second");
assert_eq!(articles[2].id, 30);
assert_eq!(articles[2].title, "Third");
}
#[test]
fn test_redirect_page() {
let xml = r#"<mediawiki>
<siteinfo>
<sitename>Wikipedia</sitename>
<base>https://en.wikipedia.org/wiki/Main_Page</base>
</siteinfo>
<page>
<title>Redirect Page</title>
<ns>0</ns>
<id>50</id>
<redirect title="Target Page" />
<revision>
<id>300</id>
<text>#REDIRECT [[Target Page]]</text>
</revision>
</page>
</mediawiki>"#;
let mut reader = reader_from_xml(xml, &[0]);
let article = reader.next().unwrap().unwrap();
assert_eq!(article.id, 50);
assert_eq!(article.title, "Redirect Page");
assert_eq!(article.text, "#REDIRECT [[Target Page]]");
}
#[test]
fn test_missing_text() {
let xml = r#"<mediawiki>
<siteinfo>
<sitename>Wikipedia</sitename>
<base>https://en.wikipedia.org/wiki/Main_Page</base>
</siteinfo>
<page>
<title>No Text Page</title>
<ns>0</ns>
<id>60</id>
<revision>
<id>400</id>
</revision>
</page>
</mediawiki>"#;
let mut reader = reader_from_xml(xml, &[0]);
let article = reader.next().unwrap().unwrap();
assert_eq!(article.id, 60);
assert_eq!(article.title, "No Text Page");
assert_eq!(article.text, "");
}
}