use std::path::PathBuf;
use anyhow::{Context, Result};
use clap::Parser;
use rayon::prelude::*;
use wikiext::{
OutputConfig, OutputFormat, OutputSplitter, clean_wikitext, format_page, open_dump,
parse_file_size,
};
// Command-line arguments for the `wikiext` extractor, parsed via clap derive.
//
// NOTE(review): comments in this struct are deliberately `//` rather than
// `///` — clap turns doc comments on the struct and its fields into
// `--help`/about text, which would change the program's runtime output.
#[derive(Parser)]
#[command(name = "wikiext", version, about)]
struct Cli {
    // Positional: path to the wiki dump to read (format handled by `open_dump`).
    input: PathBuf,
    // Output destination; used verbatim as the `OutputSplitter` path.
    #[arg(short, long, default_value = "text")]
    output: String,
    // Maximum output file size, as a human-readable string (e.g. "1M");
    // parsed by `parse_file_size`.
    #[arg(short, long, default_value = "1M")]
    bytes: String,
    // Compress output files.
    #[arg(short, long)]
    compress: bool,
    // Emit JSON records instead of the default document format.
    #[arg(long)]
    json: bool,
    // Number of rayon worker threads; rayon's default when unset.
    #[arg(long)]
    processes: Option<usize>,
    // Suppress the progress counter on stderr.
    #[arg(short, long)]
    quiet: bool,
    // Comma-separated list of namespace ids to extract (e.g. "0,14").
    #[arg(long, default_value = "0")]
    namespaces: String,
}
// Maximum number of articles collected into each in-memory batch before the
// parallel cleanup pass in `main` (bounds the size of the `batch`/`results`
// vectors per iteration).
const BATCH_SIZE: usize = 1000;
/// Unwrap a dump-reader entry, printing a warning to stderr and dropping the
/// entry on error. Shared by the first-article scan and the batch loop so the
/// warning text stays consistent in one place.
fn ok_or_warn<T, E: std::fmt::Display>(result: Result<T, E>) -> Option<T> {
    match result {
        Ok(value) => Some(value),
        Err(e) => {
            eprintln!("warning: error reading page: {e}");
            None
        }
    }
}

/// Entry point: parse CLI args, stream articles out of the dump, clean the
/// wikitext in parallel batches, and write formatted pages through an
/// `OutputSplitter`.
fn main() -> Result<()> {
    env_logger::init();
    let cli = Cli::parse();

    // Parse the comma-separated namespace list, failing fast on any bad entry.
    let namespaces: Vec<i32> = cli
        .namespaces
        .split(',')
        .map(|s| {
            s.trim()
                .parse::<i32>()
                .with_context(|| format!("invalid namespace: '{s}'"))
        })
        .collect::<Result<Vec<_>>>()?;

    let max_file_size = parse_file_size(&cli.bytes)
        .with_context(|| format!("invalid bytes value: '{}'", cli.bytes))?;

    // Size the global rayon pool before any parallel work happens;
    // otherwise rayon's default thread count is used.
    if let Some(n) = cli.processes {
        rayon::ThreadPoolBuilder::new()
            .num_threads(n)
            .build_global()
            .context("failed to configure thread pool")?;
    }

    let output_format = if cli.json {
        OutputFormat::Json
    } else {
        OutputFormat::Doc
    };

    let mut dump_reader = open_dump(&cli.input, &namespaces)
        .with_context(|| format!("failed to open dump: {:?}", cli.input))?;

    // Pull the first readable article eagerly before querying `url_base()`.
    // NOTE(review): the original code did this before calling `url_base()`,
    // presumably so the reader has consumed the dump header first — the
    // ordering is preserved; confirm against the `wikiext` reader's contract.
    let first_article = dump_reader.by_ref().find_map(ok_or_warn);
    let url_base = dump_reader.url_base().to_string();

    let config = OutputConfig {
        path: PathBuf::from(&cli.output),
        max_file_size,
        compress: cli.compress,
    };
    let mut output = OutputSplitter::new(config).context("failed to create output")?;
    let mut total_pages: u64 = 0;

    // The first article is cleaned serially; everything after it goes
    // through the parallel batch loop below.
    if let Some(article) = first_article {
        let text = clean_wikitext(&article.text);
        let formatted = format_page(article.id, &article.title, &url_base, &text, output_format);
        output.write(&formatted).context("failed to write output")?;
        total_pages += 1;
    }

    loop {
        // Collect up to BATCH_SIZE readable articles, warning on (and
        // skipping) entries the reader could not parse.
        let batch: Vec<_> = dump_reader
            .by_ref()
            .filter_map(ok_or_warn)
            .take(BATCH_SIZE)
            .collect();
        if batch.is_empty() {
            break;
        }
        let batch_len = batch.len() as u64;

        // CPU-heavy wikitext cleanup runs on the rayon pool; writing stays
        // serial below so output order matches dump order.
        let results: Vec<String> = batch
            .par_iter()
            .map(|article| {
                let text = clean_wikitext(&article.text);
                format_page(article.id, &article.title, &url_base, &text, output_format)
            })
            .collect();
        for formatted in &results {
            output.write(formatted).context("failed to write output")?;
        }

        total_pages += batch_len;
        if !cli.quiet {
            // Carriage return keeps the counter on one line of stderr.
            eprint!("\r{total_pages} pages processed");
        }
    }

    output.close().context("failed to close output")?;
    if !cli.quiet {
        eprintln!("\r{total_pages} pages processed - done.");
    }
    Ok(())
}